You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/04/03 06:43:53 UTC

arrow git commit: ARROW-749: [Python] Delete partially-written Feather file when column write fails

Repository: arrow
Updated Branches:
  refs/heads/master 8f113b4d0 -> 96f3d6176


ARROW-749: [Python] Delete partially-written Feather file when column write fails

This is currently the only place where we do an atomic create-file/write-file. Going forward, we should be mindful of other serialization functions that may likewise leave behind unreadable, partially written files.

Author: Wes McKinney <we...@twosigma.com>

Closes #484 from wesm/ARROW-749 and squashes the following commits:

137e235 [Wes McKinney] Delete partially-written Feather file when column write fails


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/96f3d617
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/96f3d617
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/96f3d617

Branch: refs/heads/master
Commit: 96f3d6176d8c95717f4ff45e4226161de3168b05
Parents: 8f113b4
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Apr 3 08:43:47 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Apr 3 08:43:47 2017 +0200

----------------------------------------------------------------------
 python/pyarrow/feather.py            | 79 ++++++++++++++++++++-----------
 python/pyarrow/tests/test_feather.py | 16 +++++++
 2 files changed, 67 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/96f3d617/python/pyarrow/feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index f87c7f3..3b5716e 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -15,8 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import six
 from distutils.version import LooseVersion
+import os
+
+import six
 import pandas as pd
 
 from pyarrow.compat import pdapi
@@ -54,45 +56,66 @@ class FeatherReader(ext.FeatherReader):
         return table.to_pandas()
 
 
-def write_feather(df, dest):
-    '''
-    Write a pandas.DataFrame to Feather format
-    '''
-    writer = ext.FeatherWriter()
-    writer.open(dest)
+class FeatherWriter(object):
 
-    if isinstance(df, pd.SparseDataFrame):
-        df = df.to_dense()
+    def __init__(self, dest):
+        self.dest = dest
+        self.writer = ext.FeatherWriter()
+        self.writer.open(dest)
 
-    if not df.columns.is_unique:
-        raise ValueError("cannot serialize duplicate column names")
+    def write(self, df):
+        if isinstance(df, pd.SparseDataFrame):
+            df = df.to_dense()
 
-    # TODO(wesm): pipeline conversion to Arrow memory layout
-    for i, name in enumerate(df.columns):
-        col = df.iloc[:, i]
+        if not df.columns.is_unique:
+            raise ValueError("cannot serialize duplicate column names")
 
-        if pdapi.is_object_dtype(col):
-            inferred_type = pd.lib.infer_dtype(col)
-            msg = ("cannot serialize column {n} "
-                   "named {name} with dtype {dtype}".format(
-                       n=i, name=name, dtype=inferred_type))
+        # TODO(wesm): pipeline conversion to Arrow memory layout
+        for i, name in enumerate(df.columns):
+            col = df.iloc[:, i]
 
-            if inferred_type in ['mixed']:
+            if pdapi.is_object_dtype(col):
+                inferred_type = pd.lib.infer_dtype(col)
+                msg = ("cannot serialize column {n} "
+                       "named {name} with dtype {dtype}".format(
+                           n=i, name=name, dtype=inferred_type))
 
-                # allow columns with nulls + an inferable type
-                inferred_type = pd.lib.infer_dtype(col[col.notnull()])
                 if inferred_type in ['mixed']:
+
+                    # allow columns with nulls + an inferable type
+                    inferred_type = pd.lib.infer_dtype(col[col.notnull()])
+                    if inferred_type in ['mixed']:
+                        raise ValueError(msg)
+
+                elif inferred_type not in ['unicode', 'string']:
                     raise ValueError(msg)
 
-            elif inferred_type not in ['unicode', 'string']:
-                raise ValueError(msg)
+            if not isinstance(name, six.string_types):
+                name = str(name)
 
-        if not isinstance(name, six.string_types):
-            name = str(name)
+            self.writer.write_array(name, col)
 
-        writer.write_array(name, col)
+        self.writer.close()
 
-    writer.close()
+
+def write_feather(df, dest):
+    '''
+    Write a pandas.DataFrame to Feather format
+    '''
+    writer = FeatherWriter(dest)
+    try:
+        writer.write(df)
+    except:
+        # Try to make sure the resource is closed
+        import gc
+        writer = None
+        gc.collect()
+        if isinstance(dest, six.string_types):
+            try:
+                os.remove(dest)
+            except os.error:
+                pass
+        raise
 
 
 def read_feather(source, columns=None):

http://git-wip-us.apache.org/repos/asf/arrow/blob/96f3d617/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 525da34..c7b4f1e 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -249,6 +249,22 @@ class TestFeatherReader(unittest.TestCase):
         df = pd.DataFrame({'bools': arr})
         self._check_pandas_roundtrip(df, null_counts=[1 * repeats])
 
+    def test_delete_partial_file_on_error(self):
+        # strings will fail
+        df = pd.DataFrame(
+            {
+                'numbers': range(5),
+                'strings': [b'foo', None, u'bar', 'qux', np.nan]},
+            columns=['numbers', 'strings'])
+
+        path = random_path()
+        try:
+            write_feather(df, path)
+        except:
+            pass
+
+        assert not os.path.exists(path)
+
     def test_strings(self):
         repeats = 1000