Posted to commits@arrow.apache.org by we...@apache.org on 2018/01/25 22:01:11 UTC

[arrow] branch master updated: ARROW-2031: [Python] HadoopFileSystem is pickleable

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8edd62e  ARROW-2031: [Python] HadoopFileSystem is pickleable
8edd62e is described below

commit 8edd62e1bda0bf38f0fce872167be99826d28da5
Author: Jim Crist <ji...@gmail.com>
AuthorDate: Thu Jan 25 17:01:02 2018 -0500

    ARROW-2031: [Python] HadoopFileSystem is pickleable
    
    Adds support for pickling `HadoopFileSystem`.
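    
    A rough usage sketch (connection parameters below are placeholders;
    a reachable HDFS cluster is assumed):
    
        import pickle
    
        import pyarrow as pa
    
        # Round-trip the filesystem through pickle. __reduce__ records
        # (host, port, user, kerb_ticket, driver), so unpickling simply
        # reconnects with the same parameters -- handy for shipping the
        # client to worker processes.
        fs = pa.hdfs.connect(host='localhost', port=20500, user='hdfs')
        fs2 = pickle.loads(pickle.dumps(fs))
        assert fs2.is_open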
    
    A few additional small fixes:
    - Adds a check that `driver` is one of {'libhdfs3', 'libhdfs'}.
    - A few small tweaks to the hdfs tests to make them easier to run
      locally (see the sketch just below).
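    
    One possible way to exercise the suite against a local cluster (values
    are illustrative; the environment variables come from the test module):
    
        import os
    
        import pytest
    
        # Defaults mirror the test module; after this change,
        # ARROW_HDFS_TEST_USER may be omitted entirely.
        os.environ.setdefault('ARROW_HDFS_TEST_HOST', 'localhost')
        os.environ.setdefault('ARROW_HDFS_TEST_PORT', '20500')
        pytest.main(['python/pyarrow/tests/test_hdfs.py'])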
    
    Author: Jim Crist <ji...@gmail.com>
    
    Closes #1505 from jcrist/pickle-hdfs-filesystem and squashes the following commits:
    
    b19ed3e0 [Jim Crist] Compat with older cython versions
    1f747264 [Jim Crist] HadoopFileSystem is pickleable
---
 python/pyarrow/hdfs.py            |  4 ++++
 python/pyarrow/io-hdfs.pxi        | 20 ++++++++++++++++----
 python/pyarrow/tests/test_hdfs.py | 23 ++++++++++++++++++++---
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 3c9d041..3f2014b 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -36,6 +36,10 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem):
 
         self._connect(host, port, user, kerb_ticket, driver)
 
+    def __reduce__(self):
+        return (HadoopFileSystem, (self.host, self.port, self.user,
+                                   self.kerb_ticket, self.driver))
+
     @implements(FileSystem.isdir)
     def isdir(self, path):
         return super(HadoopFileSystem, self).isdir(path)
diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi
index e653813..3abf045 100644
--- a/python/pyarrow/io-hdfs.pxi
+++ b/python/pyarrow/io-hdfs.pxi
@@ -59,29 +59,41 @@ cdef class HadoopFileSystem:
 
     cdef readonly:
         bint is_open
-
-    def __cinit__(self):
-        pass
+        str host
+        str user
+        str kerb_ticket
+        str driver
+        int port
 
     def _connect(self, host, port, user, kerb_ticket, driver):
         cdef HdfsConnectionConfig conf
 
         if host is not None:
             conf.host = tobytes(host)
+        self.host = host
+
         conf.port = port
+        self.port = port
+
         if user is not None:
             conf.user = tobytes(user)
+        self.user = user
+
         if kerb_ticket is not None:
             conf.kerb_ticket = tobytes(kerb_ticket)
+        self.kerb_ticket = kerb_ticket
 
         if driver == 'libhdfs':
             with nogil:
                 check_status(HaveLibHdfs())
             conf.driver = HdfsDriver_LIBHDFS
-        else:
+        elif driver == 'libhdfs3':
             with nogil:
                 check_status(HaveLibHdfs3())
             conf.driver = HdfsDriver_LIBHDFS3
+        else:
+            raise ValueError("unknown driver: %r" % driver)
+        self.driver = driver
 
         with nogil:
             check_status(CHadoopFileSystem.Connect(&conf, &self.client))
diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py
index 51b6ba2..b62458c 100644
--- a/python/pyarrow/tests/test_hdfs.py
+++ b/python/pyarrow/tests/test_hdfs.py
@@ -18,6 +18,7 @@
 from io import BytesIO
 from os.path import join as pjoin
 import os
+import pickle
 import random
 import unittest
 
@@ -36,7 +37,7 @@ import pyarrow.tests.test_parquet as test_parquet
 
 def hdfs_test_client(driver='libhdfs'):
     host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost')
-    user = os.environ['ARROW_HDFS_TEST_USER']
+    user = os.environ.get('ARROW_HDFS_TEST_USER', None)
     try:
         port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500))
     except ValueError:
@@ -72,6 +73,22 @@ class HdfsTestCases(object):
         cls.hdfs.delete(cls.tmp_path, recursive=True)
         cls.hdfs.close()
 
+    def test_unknown_driver(self):
+        with pytest.raises(ValueError):
+            hdfs_test_client(driver="not_a_driver_name")
+
+    def test_pickle(self):
+        s = pickle.dumps(self.hdfs)
+        h2 = pickle.loads(s)
+        assert h2.is_open
+        assert h2.host == self.hdfs.host
+        assert h2.port == self.hdfs.port
+        assert h2.user == self.hdfs.user
+        assert h2.kerb_ticket == self.hdfs.kerb_ticket
+        assert h2.driver == self.hdfs.driver
+        # smoketest unpickled client works
+        h2.ls(self.tmp_path)
+
     def test_cat(self):
         path = pjoin(self.tmp_path, 'cat-test')
 
@@ -299,7 +316,7 @@ class TestLibHdfs(HdfsTestCases, unittest.TestCase):
     @classmethod
     def check_driver(cls):
         if not pa.have_libhdfs():
-            pytest.fail('No libhdfs available on system')
+            pytest.skip('No libhdfs available on system')
 
     def test_orphaned_file(self):
         hdfs = hdfs_test_client()
@@ -318,4 +335,4 @@ class TestLibHdfs3(HdfsTestCases, unittest.TestCase):
     @classmethod
     def check_driver(cls):
         if not pa.have_libhdfs3():
-            pytest.fail('No libhdfs3 available on system')
+            pytest.skip('No libhdfs3 available on system')
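
For readers new to the pattern: `__reduce__` returns a callable plus the
arguments needed to rebuild the object, so pickle stores those instead of
trying to serialize the native libhdfs handle, and unpickling re-runs the
connection logic. A minimal standalone sketch of the same idea (the class
and its fields are illustrative, not pyarrow code):

    import pickle

    class Client:
        def __init__(self, host, port):
            self.host = host
            self.port = port
            # imagine an unpicklable C-level connection being opened here

        def __reduce__(self):
            # (callable, args): on load, pickle calls Client(host, port)
            return (Client, (self.host, self.port))

    c2 = pickle.loads(pickle.dumps(Client('localhost', 20500)))
    assert (c2.host, c2.port) == ('localhost', 20500)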

-- 
To stop receiving notification emails like this one, please contact
wesm@apache.org.