You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/05 23:17:23 UTC

arrow git commit: ARROW-1317: [Python] Attempt to set Hadoop CLASSPATH when using JNI

Repository: arrow
Updated Branches:
  refs/heads/master 2660dda40 -> f355354c2


ARROW-1317: [Python] Attempt to set Hadoop CLASSPATH when using JNI

Author: Wes McKinney <we...@twosigma.com>

Closes #1040 from wesm/ARROW-1317 and squashes the following commits:

274f839c [Wes McKinney] Add note to documentation
1d664b29 [Wes McKinney] If HADOOP_HOME is not set, see if 'hadoop' is in PATH
c6e52d12 [Wes McKinney] Try to set CLASSPATH if HADOOP_HOME is set but classpath is not


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f355354c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f355354c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f355354c

Branch: refs/heads/master
Commit: f355354c24d58ea86558e8fc46f33b47d402ac04
Parents: 2660dda
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Sep 5 19:17:19 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Sep 5 19:17:19 2017 -0400

----------------------------------------------------------------------
 python/doc/source/filesystems.rst |  3 +++
 python/pyarrow/hdfs.py            | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/f355354c/python/doc/source/filesystems.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/filesystems.rst b/python/doc/source/filesystems.rst
index c0530f9..5c3297b 100644
--- a/python/doc/source/filesystems.rst
+++ b/python/doc/source/filesystems.rst
@@ -54,6 +54,9 @@ LD_LIBRARY_PATH), and relies on some environment variables.
 
     export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob`
 
+If ``CLASSPATH`` is not set, then it will be set automatically if the
+``hadoop`` executable is in your system path, or if ``HADOOP_HOME`` is set.
+
 You can also use libhdfs3, a thirdparty C++ library for HDFS from Pivotal Labs:
 
 .. code-block:: python

http://git-wip-us.apache.org/repos/asf/arrow/blob/f355354c/python/pyarrow/hdfs.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/hdfs.py b/python/pyarrow/hdfs.py
index 2f20be2..3c9d041 100644
--- a/python/pyarrow/hdfs.py
+++ b/python/pyarrow/hdfs.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
 import posixpath
 
 from pyarrow.util import implements
@@ -30,6 +31,9 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem):
 
     def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
                  driver='libhdfs'):
+        if driver == 'libhdfs':
+            _maybe_set_hadoop_classpath()
+
         self._connect(host, port, user, kerb_ticket, driver)
 
     @implements(FileSystem.isdir)
@@ -105,6 +109,21 @@ class HadoopFileSystem(lib.HadoopFileSystem, FileSystem):
                 yield tup
 
 
+def _maybe_set_hadoop_classpath():
+    import subprocess
+
+    if 'hadoop' in os.environ.get('CLASSPATH', ''):
+        return
+
+    if 'HADOOP_HOME' in os.environ:
+        hadoop_bin = '{0}/bin/hadoop'.format(os.environ['HADOOP_HOME'])
+    else:
+        hadoop_bin = 'hadoop'
+
+    classpath = subprocess.check_output([hadoop_bin, 'classpath', '--glob'])
+    os.environ['CLASSPATH'] = classpath.decode('utf-8')
+
+
 def _libhdfs_walk_files_dirs(top_path, contents):
     files = []
     directories = []