You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/04/03 06:43:53 UTC
arrow git commit: ARROW-749: [Python] Delete partially-written Feather file when column write fails
Repository: arrow
Updated Branches:
refs/heads/master 8f113b4d0 -> 96f3d6176
ARROW-749: [Python] Delete partially-written Feather file when column write fails
This is currently the only place where we perform an atomic create-file/write-file operation. In the future, we should be mindful of other serialization functions that may leave unreadable files behind when a write fails.
Author: Wes McKinney <we...@twosigma.com>
Closes #484 from wesm/ARROW-749 and squashes the following commits:
137e235 [Wes McKinney] Delete partially-written Feather file when column write fails
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/96f3d617
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/96f3d617
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/96f3d617
Branch: refs/heads/master
Commit: 96f3d6176d8c95717f4ff45e4226161de3168b05
Parents: 8f113b4
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Apr 3 08:43:47 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Apr 3 08:43:47 2017 +0200
----------------------------------------------------------------------
python/pyarrow/feather.py | 79 ++++++++++++++++++++-----------
python/pyarrow/tests/test_feather.py | 16 +++++++
2 files changed, 67 insertions(+), 28 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/96f3d617/python/pyarrow/feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index f87c7f3..3b5716e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -15,8 +15,10 @@
# specific language governing permissions and limitations
# under the License.
-import six
from distutils.version import LooseVersion
+import os
+
+import six
import pandas as pd
from pyarrow.compat import pdapi
@@ -54,45 +56,66 @@ class FeatherReader(ext.FeatherReader):
return table.to_pandas()
-def write_feather(df, dest):
- '''
- Write a pandas.DataFrame to Feather format
- '''
- writer = ext.FeatherWriter()
- writer.open(dest)
+class FeatherWriter(object):
- if isinstance(df, pd.SparseDataFrame):
- df = df.to_dense()
+ def __init__(self, dest):
+ self.dest = dest
+ self.writer = ext.FeatherWriter()
+ self.writer.open(dest)
- if not df.columns.is_unique:
- raise ValueError("cannot serialize duplicate column names")
+ def write(self, df):
+ if isinstance(df, pd.SparseDataFrame):
+ df = df.to_dense()
- # TODO(wesm): pipeline conversion to Arrow memory layout
- for i, name in enumerate(df.columns):
- col = df.iloc[:, i]
+ if not df.columns.is_unique:
+ raise ValueError("cannot serialize duplicate column names")
- if pdapi.is_object_dtype(col):
- inferred_type = pd.lib.infer_dtype(col)
- msg = ("cannot serialize column {n} "
- "named {name} with dtype {dtype}".format(
- n=i, name=name, dtype=inferred_type))
+ # TODO(wesm): pipeline conversion to Arrow memory layout
+ for i, name in enumerate(df.columns):
+ col = df.iloc[:, i]
- if inferred_type in ['mixed']:
+ if pdapi.is_object_dtype(col):
+ inferred_type = pd.lib.infer_dtype(col)
+ msg = ("cannot serialize column {n} "
+ "named {name} with dtype {dtype}".format(
+ n=i, name=name, dtype=inferred_type))
- # allow columns with nulls + an inferable type
- inferred_type = pd.lib.infer_dtype(col[col.notnull()])
if inferred_type in ['mixed']:
+
+ # allow columns with nulls + an inferable type
+ inferred_type = pd.lib.infer_dtype(col[col.notnull()])
+ if inferred_type in ['mixed']:
+ raise ValueError(msg)
+
+ elif inferred_type not in ['unicode', 'string']:
raise ValueError(msg)
- elif inferred_type not in ['unicode', 'string']:
- raise ValueError(msg)
+ if not isinstance(name, six.string_types):
+ name = str(name)
- if not isinstance(name, six.string_types):
- name = str(name)
+ self.writer.write_array(name, col)
- writer.write_array(name, col)
+ self.writer.close()
- writer.close()
+
+def write_feather(df, dest):
+ '''
+ Write a pandas.DataFrame to Feather format
+ '''
+ writer = FeatherWriter(dest)
+ try:
+ writer.write(df)
+ except:
+ # Try to make sure the resource is closed
+ import gc
+ writer = None
+ gc.collect()
+ if isinstance(dest, six.string_types):
+ try:
+ os.remove(dest)
+ except os.error:
+ pass
+ raise
def read_feather(source, columns=None):
http://git-wip-us.apache.org/repos/asf/arrow/blob/96f3d617/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 525da34..c7b4f1e 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -249,6 +249,22 @@ class TestFeatherReader(unittest.TestCase):
df = pd.DataFrame({'bools': arr})
self._check_pandas_roundtrip(df, null_counts=[1 * repeats])
+ def test_delete_partial_file_on_error(self):
+ # strings will fail
+ df = pd.DataFrame(
+ {
+ 'numbers': range(5),
+ 'strings': [b'foo', None, u'bar', 'qux', np.nan]},
+ columns=['numbers', 'strings'])
+
+ path = random_path()
+ try:
+ write_feather(df, path)
+ except:
+ pass
+
+ assert not os.path.exists(path)
+
def test_strings(self):
repeats = 1000