Posted to commits@arrow.apache.org by we...@apache.org on 2016/10/29 01:13:17 UTC

arrow git commit: ARROW-350: Added Kerberos to HDFS client

Repository: arrow
Updated Branches:
  refs/heads/master 3d2e4df21 -> 6178bf7b0


ARROW-350: Added Kerberos to HDFS client

Author: Christopher C. Aycock <ch...@twosigma.com>

Closes #185 from chrisaycock/ARROW-350 and squashes the following commits:

c2a4e64 [Christopher C. Aycock] Renamed 'kerb' parameter to 'kerb_ticket'
f1d63de [Christopher C. Aycock] ARROW-350: Added Kerberos to HDFS client
8f1052f [Christopher C. Aycock] ARROW-345: Proper locations of libhdfs and libjvm on Mac


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/6178bf7b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/6178bf7b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/6178bf7b

Branch: refs/heads/master
Commit: 6178bf7b0f0cf66f52536f5d5fb5ee104e696f3c
Parents: 3d2e4df
Author: Christopher C. Aycock <ch...@twosigma.com>
Authored: Fri Oct 28 21:13:02 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Oct 28 21:13:02 2016 -0400

----------------------------------------------------------------------
 cpp/doc/HDFS.md                         | 22 ++++++-
 cpp/src/arrow/io/hdfs.cc                | 16 ++++-
 cpp/src/arrow/io/hdfs.h                 |  9 +--
 cpp/src/arrow/io/libhdfs_shim.cc        | 87 ++++++++++++++++++++--------
 python/pyarrow/includes/libarrow_io.pxd |  1 +
 python/pyarrow/io.pyx                   | 29 +++++++---
 6 files changed, 124 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/doc/HDFS.md
----------------------------------------------------------------------
diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md
index 83311db..6b1bb8c 100644
--- a/cpp/doc/HDFS.md
+++ b/cpp/doc/HDFS.md
@@ -43,7 +43,7 @@ LD_LIBRARY_PATH), and relies on some environment variables.
 export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
 ```
 
-#### Setting $JAVA_HOME  automatically on OS X
+### Mac Specifics
 
 The installed location of Java on OS X can vary; however, the following
 snippet will set it automatically for you:
@@ -51,3 +51,23 @@ will set it automatically for you:
 ```shell
 export JAVA_HOME=$(/usr/libexec/java_home)
 ```
+
+Homebrew's Hadoop does not include the native libs, and the upstream Apache
+builds do not ship them for OS X, so users must build Hadoop from source to
+get them. See this Stack Overflow answer for details:
+
+http://stackoverflow.com/a/40051353/478288
+
+Be sure to include the path to the native libs in `JAVA_LIBRARY_PATH`:
+
+```shell
+export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
+```
+
+If you get an error about needing to install Java 6, add *BundledApp* and
+*JNI* to the `JVMCapabilities` array in `$JAVA_HOME/../Info.plist`. See
+
+https://oliverdowling.com.au/2015/10/09/oracles-jre-8-on-mac-os-x-el-capitan/
+
+https://derflounder.wordpress.com/2015/08/08/modifying-oracles-java-sdk-to-run-java-applications-on-os-x/
+

http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/hdfs.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc
index b74f846..6490a75 100644
--- a/cpp/src/arrow/io/hdfs.cc
+++ b/cpp/src/arrow/io/hdfs.cc
@@ -287,12 +287,25 @@ class HdfsClient::HdfsClientImpl {
   Status Connect(const HdfsConnectionConfig* config) {
     RETURN_NOT_OK(ConnectLibHdfs());
 
-    fs_ = hdfsConnectAsUser(config->host.c_str(), config->port, config->user.c_str());
+    // connect to HDFS with the builder object
+    hdfsBuilder* builder = hdfsNewBuilder();
+    if (!config->host.empty()) {
+      hdfsBuilderSetNameNode(builder, config->host.c_str());
+    }
+    hdfsBuilderSetNameNodePort(builder, config->port);
+    if (!config->user.empty()) {
+      hdfsBuilderSetUserName(builder, config->user.c_str());
+    }
+    if (!config->kerb_ticket.empty()) {
+      hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str());
+    }
+    fs_ = hdfsBuilderConnect(builder);
 
     if (fs_ == nullptr) { return Status::IOError("HDFS connection failed"); }
     namenode_host_ = config->host;
     port_ = config->port;
     user_ = config->user;
+    kerb_ticket_ = config->kerb_ticket;
 
     return Status::OK();
   }
@@ -425,6 +438,7 @@ class HdfsClient::HdfsClientImpl {
   std::string namenode_host_;
   std::string user_;
   int port_;
+  std::string kerb_ticket_;
 
   hdfsFS fs_;
 };

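For reference, here is a minimal sketch of how a caller might use the new
configuration-based Connect, including the `kerb_ticket` field this commit
adds. The snippet is illustrative and not part of the commit; it assumes the
`arrow::io` namespace and the `Connect(config, fs)` signature documented in
hdfs.h below, and the ticket cache path is a hypothetical example value.

```cpp
// Illustrative sketch (not part of this commit): connect to HDFS using the
// new HdfsConnectionConfig, including the kerb_ticket field added here.
#include <iostream>
#include <memory>

#include "arrow/io/hdfs.h"

int main() {
  arrow::io::HdfsConnectionConfig config;
  config.host = "default";  // "default" resolves fs.defaultFS from core-site.xml
  config.port = 0;          // 0 selects the default or logical (HA) port
  config.user = "";         // empty string means the login user
  config.kerb_ticket = "/tmp/krb5cc_1000";  // hypothetical ticket cache path

  std::shared_ptr<arrow::io::HdfsClient> client;
  arrow::Status status = arrow::io::HdfsClient::Connect(&config, &client);
  if (!status.ok()) {
    std::cerr << "Connect failed: " << status.ToString() << std::endl;
    return 1;
  }
  client->Disconnect();
  return 0;
}
```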
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/hdfs.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h
index 4a4e3ec..48699c9 100644
--- a/cpp/src/arrow/io/hdfs.h
+++ b/cpp/src/arrow/io/hdfs.h
@@ -60,19 +60,16 @@ struct HdfsConnectionConfig {
   std::string host;
   int port;
   std::string user;
-
-  // TODO: Kerberos, etc.
+  std::string kerb_ticket;
 };
 
 class ARROW_EXPORT HdfsClient : public FileSystemClient {
  public:
   ~HdfsClient();
 
-  // Connect to an HDFS cluster at indicated host, port, and as user
+  // Connect to an HDFS cluster given a configuration
   //
-  // @param host (in)
-  // @param port (in)
-  // @param user (in): user to identify as
+  // @param config (in): configuration for connecting
   // @param fs (out): the created client
   // @returns Status
   static Status Connect(

http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/libhdfs_shim.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc
index f256c31..07eb625 100644
--- a/cpp/src/arrow/io/libhdfs_shim.cc
+++ b/cpp/src/arrow/io/libhdfs_shim.cc
@@ -73,9 +73,17 @@ static HINSTANCE libjvm_handle = NULL;
 
 // NOTE(wesm): cpplint does not like use of short and other imprecise C types
 
-static hdfsFS (*ptr_hdfsConnectAsUser)(
-    const char* host, tPort port, const char* user) = NULL;
-static hdfsFS (*ptr_hdfsConnect)(const char* host, tPort port) = NULL;
+static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL;
+static void (*ptr_hdfsBuilderSetNameNode)(
+    hdfsBuilder* bld, const char* nn) = NULL;
+static void (*ptr_hdfsBuilderSetNameNodePort)(
+    hdfsBuilder* bld, tPort port) = NULL;
+static void (*ptr_hdfsBuilderSetUserName)(
+    hdfsBuilder* bld, const char* userName) = NULL;
+static void (*ptr_hdfsBuilderSetKerbTicketCachePath)(
+    hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL;
+static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL;
+
 static int (*ptr_hdfsDisconnect)(hdfsFS fs) = NULL;
 
 static hdfsFile (*ptr_hdfsOpenFile)(hdfsFS fs, const char* path, int flags,
@@ -149,18 +157,29 @@ static void* get_symbol(const char* symbol) {
 #endif
 }
 
-hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char* user) {
-  return ptr_hdfsConnectAsUser(host, port, user);
+hdfsBuilder* hdfsNewBuilder(void) {
+  return ptr_hdfsNewBuilder();
 }
 
-// Returns NULL on failure
-hdfsFS hdfsConnect(const char* host, tPort port) {
-  if (ptr_hdfsConnect) {
-    return ptr_hdfsConnect(host, port);
-  } else {
-    // TODO: error reporting when shim setup fails
-    return NULL;
-  }
+void hdfsBuilderSetNameNode(hdfsBuilder* bld, const char* nn) {
+  ptr_hdfsBuilderSetNameNode(bld, nn);
+}
+
+void hdfsBuilderSetNameNodePort(hdfsBuilder* bld, tPort port) {
+  ptr_hdfsBuilderSetNameNodePort(bld, port);
+}
+
+void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) {
+  ptr_hdfsBuilderSetUserName(bld, userName);
+}
+
+void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld,
+    const char* kerbTicketCachePath) {
+  ptr_hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath);
+}
+
+hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) {
+  return ptr_hdfsBuilderConnect(bld);
 }
 
 int hdfsDisconnect(hdfsFS fs) {
@@ -342,18 +361,36 @@ int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime) {
 }
 
 static std::vector<fs::path> get_potential_libhdfs_paths() {
-  std::vector<fs::path> libhdfs_potential_paths = {
-      // find one in the local directory
-      fs::path("./libhdfs.so"), fs::path("./hdfs.dll"),
-      // find a global libhdfs.so
-      fs::path("libhdfs.so"), fs::path("hdfs.dll"),
+  std::vector<fs::path> libhdfs_potential_paths;
+  std::string file_name;
+
+  // OS-specific file name
+#ifdef __WIN32
+  file_name = "hdfs.dll";
+#elif __APPLE__
+  file_name = "libhdfs.dylib";
+#else
+  file_name = "libhdfs.so";
+#endif
+
+  // Common paths
+  std::vector<fs::path> search_paths = {
+      fs::path(""),
+      fs::path(".")
   };
 
+  // Path from environment variable
   const char* hadoop_home = std::getenv("HADOOP_HOME");
   if (hadoop_home != nullptr) {
-    auto path = fs::path(hadoop_home) / "lib/native/libhdfs.so";
-    libhdfs_potential_paths.push_back(path);
+    auto path = fs::path(hadoop_home) / "lib/native";
+    search_paths.push_back(path);
   }
+
+  // All paths with file name
+  for (auto& path : search_paths) {
+    libhdfs_potential_paths.push_back(path / file_name);
+  }
+
   return libhdfs_potential_paths;
 }
 
@@ -371,7 +408,7 @@ static std::vector<fs::path> get_potential_libjvm_paths() {
   file_name = "jvm.dll";
 #elif __APPLE__
   search_prefixes = {""};
-  search_suffixes = {""};
+  search_suffixes = {"", "/jre/lib/server"};
   file_name = "libjvm.dylib";
 
 // SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are
@@ -513,8 +550,12 @@ Status ARROW_EXPORT ConnectLibHdfs() {
     return Status::IOError("Prior attempt to load libhdfs failed");
   }
 
-  GET_SYMBOL_REQUIRED(hdfsConnect);
-  GET_SYMBOL_REQUIRED(hdfsConnectAsUser);
+  GET_SYMBOL_REQUIRED(hdfsNewBuilder);
+  GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNode);
+  GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNodePort);
+  GET_SYMBOL_REQUIRED(hdfsBuilderSetUserName);
+  GET_SYMBOL_REQUIRED(hdfsBuilderSetKerbTicketCachePath);
+  GET_SYMBOL_REQUIRED(hdfsBuilderConnect);
   GET_SYMBOL_REQUIRED(hdfsCreateDirectory);
   GET_SYMBOL_REQUIRED(hdfsDelete);
   GET_SYMBOL_REQUIRED(hdfsDisconnect);

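The shim resolves these entry points from libhdfs at runtime instead of
linking against it. Below is a condensed sketch of the underlying
dlopen/dlsym pattern; it is POSIX-only and simplified, whereas the real shim
also handles LoadLibrary/GetProcAddress on Windows and wraps the lookup in
its GET_SYMBOL/GET_SYMBOL_REQUIRED macros. The helper name is hypothetical.

```cpp
// Condensed sketch of the shim's runtime-loading pattern (POSIX only).
#include <dlfcn.h>
#include <cstdio>

// Opaque forward declaration standing in for libhdfs' builder type.
struct hdfsBuilder;
static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = nullptr;

static bool LoadRequiredSymbols(const char* libhdfs_path) {
  void* handle = dlopen(libhdfs_path, RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return false;
  }
  // Each required symbol must resolve, mirroring GET_SYMBOL_REQUIRED;
  // a missing symbol fails the whole ConnectLibHdfs step.
  ptr_hdfsNewBuilder = reinterpret_cast<hdfsBuilder* (*)(void)>(
      dlsym(handle, "hdfsNewBuilder"));
  if (ptr_hdfsNewBuilder == nullptr) {
    std::fprintf(stderr, "required symbol hdfsNewBuilder not found\n");
    return false;
  }
  return true;
}
```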
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/python/pyarrow/includes/libarrow_io.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd
index 8074915..7703415 100644
--- a/python/pyarrow/includes/libarrow_io.pxd
+++ b/python/pyarrow/includes/libarrow_io.pxd
@@ -93,6 +93,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil:
         c_string host
         int port
         c_string user
+        c_string kerb_ticket
 
     cdef cppclass HdfsPathInfo:
         ObjectType kind;

http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 16ebfa1..0e6b81e 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -288,9 +288,6 @@ cdef class HdfsClient:
         shared_ptr[CHdfsClient] client
 
     cdef readonly:
-        object host
-        int port
-        object user
         bint is_open
 
     def __cinit__(self):
@@ -301,6 +298,9 @@ cdef class HdfsClient:
             self.close()
 
     def close(self):
+        """
+        Disconnect from the HDFS cluster
+        """
         self._ensure_client()
         with nogil:
             check_status(self.client.get().Disconnect())
@@ -313,14 +313,21 @@ cdef class HdfsClient:
             raise IOError('HDFS client is closed')
 
     @classmethod
-    def connect(cls, host, port, user):
+    def connect(cls, host="default", port=0, user=None, kerb_ticket=None):
         """
+        Connect to an HDFS cluster. All parameters are optional and should
+        only be set if the defaults need to be overridden.
+
+        Authentication should be automatic if the HDFS cluster uses Kerberos.
+        However, if a username is specified, then the ticket cache will likely
+        be required.
 
         Parameters
         ----------
-        host :
-        port :
-        user :
+        host : NameNode. Set to "default" for fs.defaultFS from core-site.xml.
+        port : NameNode's port. Set to 0 for default or logical (HA) nodes.
+        user : Username when connecting to HDFS; None implies login user.
+        kerb_ticket : Path to Kerberos ticket cache.
 
         Notes
         -----
@@ -335,9 +342,13 @@ cdef class HdfsClient:
             HdfsClient out = HdfsClient()
             HdfsConnectionConfig conf
 
-        conf.host = tobytes(host)
+        if host is not None:
+            conf.host = tobytes(host)
         conf.port = port
-        conf.user = tobytes(user)
+        if user is not None:
+            conf.user = tobytes(user)
+        if kerb_ticket is not None:
+            conf.kerb_ticket = tobytes(kerb_ticket)
 
         with nogil:
             check_status(CHdfsClient.Connect(&conf, &out.client))