You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by wesm@apache.org on 2018/01/25 22:01:11 UTC
[arrow] branch master updated: ARROW-2031: [Python] HadoopFileSystem is pickleable
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8edd62e ARROW-2031: [Python] HadoopFileSystem is pickleable
8edd62e is described below
commit 8edd62e1bda0bf38f0fce872167be99826d28da5
Author: Jim Crist <ji...@gmail.com>
AuthorDate: Thu Jan 25 17:01:02 2018 -0500
ARROW-2031: [Python] HadoopFileSystem is pickleable
Adds support for pickling `HadoopFileSystem`
A few additional small fixes:
- Adds a check that `driver` is one of {'libhdfs3', 'libhdfs'}
- A few small tweaks to the hdfs tests to make them easier to run locally.
Author: Jim Crist <ji...@gmail.com>
Closes #1505 from jcrist/pickle-hdfs-filesystem and squashes the following commits:
b19ed3e0 [Jim Crist] Compat with older cython versions
1f747264 [Jim Crist] HadoopFileSystem is pickleable
---
python/pyarrow/hdfs.py | 4 ++++
python/pyarrow/io-hdfs.pxi | 20 ++++++++++++++++----
python/pyarrow/tests/test_hdfs.py | 23 ++++++++++++++++++++---
3 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 3c9d041..3f2014b 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -36,6 +36,10 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem):
self._connect(host, port, user, kerb_ticket, driver)
+ def __reduce__(self):
+ return (HadoopFileSystem, (self.host, self.port, self.user,
+ self.kerb_ticket, self.driver))
+
@implements(FileSystem.isdir)
def isdir(self, path):
return super(HadoopFileSystem, self).isdir(path)
diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi
index e653813..3abf045 100644
--- a/python/pyarrow/io-hdfs.pxi
+++ b/python/pyarrow/io-hdfs.pxi
@@ -59,29 +59,41 @@ cdef class HadoopFileSystem:
cdef readonly:
bint is_open
-
- def __cinit__(self):
- pass
+ str host
+ str user
+ str kerb_ticket
+ str driver
+ int port
def _connect(self, host, port, user, kerb_ticket, driver):
cdef HdfsConnectionConfig conf
if host is not None:
conf.host = tobytes(host)
+ self.host = host
+
conf.port = port
+ self.port = port
+
if user is not None:
conf.user = tobytes(user)
+ self.user = user
+
if kerb_ticket is not None:
conf.kerb_ticket = tobytes(kerb_ticket)
+ self.kerb_ticket = kerb_ticket
if driver == 'libhdfs':
with nogil:
check_status(HaveLibHdfs())
conf.driver = HdfsDriver_LIBHDFS
- else:
+ elif driver == 'libhdfs3':
with nogil:
check_status(HaveLibHdfs3())
conf.driver = HdfsDriver_LIBHDFS3
+ else:
+ raise ValueError("unknown driver: %r" % driver)
+ self.driver = driver
with nogil:
check_status(CHadoopFileSystem.Connect(&conf, &self.client))
diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py
index 51b6ba2..b62458c 100644
--- a/python/pyarrow/tests/test_hdfs.py
+++ b/python/pyarrow/tests/test_hdfs.py
@@ -18,6 +18,7 @@
from io import BytesIO
from os.path import join as pjoin
import os
+import pickle
import random
import unittest
@@ -36,7 +37,7 @@ import pyarrow.tests.test_parquet as test_parquet
def hdfs_test_client(driver='libhdfs'):
host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost')
- user = os.environ['ARROW_HDFS_TEST_USER']
+ user = os.environ.get('ARROW_HDFS_TEST_USER', None)
try:
port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500))
except ValueError:
@@ -72,6 +73,22 @@ class HdfsTestCases(object):
cls.hdfs.delete(cls.tmp_path, recursive=True)
cls.hdfs.close()
+ def test_unknown_driver(self):
+ with pytest.raises(ValueError):
+ hdfs_test_client(driver="not_a_driver_name")
+
+ def test_pickle(self):
+ s = pickle.dumps(self.hdfs)
+ h2 = pickle.loads(s)
+ assert h2.is_open
+ assert h2.host == self.hdfs.host
+ assert h2.port == self.hdfs.port
+ assert h2.user == self.hdfs.user
+ assert h2.kerb_ticket == self.hdfs.kerb_ticket
+ assert h2.driver == self.hdfs.driver
+ # smoketest unpickled client works
+ h2.ls(self.tmp_path)
+
def test_cat(self):
path = pjoin(self.tmp_path, 'cat-test')
@@ -299,7 +316,7 @@ class TestLibHdfs(HdfsTestCases, unittest.TestCase):
@classmethod
def check_driver(cls):
if not pa.have_libhdfs():
- pytest.fail('No libhdfs available on system')
+ pytest.skip('No libhdfs available on system')
def test_orphaned_file(self):
hdfs = hdfs_test_client()
@@ -318,4 +335,4 @@ class TestLibHdfs3(HdfsTestCases, unittest.TestCase):
@classmethod
def check_driver(cls):
if not pa.have_libhdfs3():
- pytest.fail('No libhdfs3 available on system')
+ pytest.skip('No libhdfs3 available on system')
--
To stop receiving notification emails like this one, please contact
wesm@apache.org.