You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/10/29 01:13:17 UTC
arrow git commit: ARROW-350: Added Kerberos to HDFS client
Repository: arrow
Updated Branches:
refs/heads/master 3d2e4df21 -> 6178bf7b0
ARROW-350: Added Kerberos to HDFS client
Author: Christopher C. Aycock <ch...@twosigma.com>
Closes #185 from chrisaycock/ARROW-350 and squashes the following commits:
c2a4e64 [Christopher C. Aycock] Renamed 'kerb' parameter to 'kerb_ticket'
f1d63de [Christopher C. Aycock] ARROW-350: Added Kerberos to HDFS client
8f1052f [Christopher C. Aycock] ARROW-345: Proper locations of libhdfs and libjvm on Mac
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/6178bf7b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/6178bf7b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/6178bf7b
Branch: refs/heads/master
Commit: 6178bf7b0f0cf66f52536f5d5fb5ee104e696f3c
Parents: 3d2e4df
Author: Christopher C. Aycock <ch...@twosigma.com>
Authored: Fri Oct 28 21:13:02 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Oct 28 21:13:02 2016 -0400
----------------------------------------------------------------------
cpp/doc/HDFS.md | 22 ++++++-
cpp/src/arrow/io/hdfs.cc | 16 ++++-
cpp/src/arrow/io/hdfs.h | 9 +--
cpp/src/arrow/io/libhdfs_shim.cc | 87 ++++++++++++++++++++--------
python/pyarrow/includes/libarrow_io.pxd | 1 +
python/pyarrow/io.pyx | 29 +++++++---
6 files changed, 124 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/doc/HDFS.md
----------------------------------------------------------------------
diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md
index 83311db..6b1bb8c 100644
--- a/cpp/doc/HDFS.md
+++ b/cpp/doc/HDFS.md
@@ -43,7 +43,7 @@ LD_LIBRARY_PATH), and relies on some environment variables.
export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
```
-#### Setting $JAVA_HOME automatically on OS X
+### Mac Specifics
The installed location of Java on OS X can vary, however the following snippet
will set it automatically for you:
@@ -51,3 +51,23 @@ will set it automatically for you:
```shell
export JAVA_HOME=$(/usr/libexec/java_home)
```
+
+Homebrew's Hadoop does not have native libs. Apache doesn't build these, so
+users must build Hadoop to get the native libs. See this Stack Overflow
+answer for details:
+
+http://stackoverflow.com/a/40051353/478288
+
+Be sure to include the path to the native libs in `JAVA_LIBRARY_PATH`:
+
+```shell
+export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
+```
+
+If you get an error about needing to install Java 6, then add *BundledApp* and
+*JNI* to the `JVMCapabilities` in `$JAVA_HOME/../Info.plist`. See:
+
+https://oliverdowling.com.au/2015/10/09/oracles-jre-8-on-mac-os-x-el-capitan/
+
+https://derflounder.wordpress.com/2015/08/08/modifying-oracles-java-sdk-to-run-java-applications-on-os-x/
+
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/hdfs.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc
index b74f846..6490a75 100644
--- a/cpp/src/arrow/io/hdfs.cc
+++ b/cpp/src/arrow/io/hdfs.cc
@@ -287,12 +287,25 @@ class HdfsClient::HdfsClientImpl {
Status Connect(const HdfsConnectionConfig* config) {
RETURN_NOT_OK(ConnectLibHdfs());
- fs_ = hdfsConnectAsUser(config->host.c_str(), config->port, config->user.c_str());
+ // connect to HDFS with the builder object
+ hdfsBuilder* builder = hdfsNewBuilder();
+ if (!config->host.empty()) {
+ hdfsBuilderSetNameNode(builder, config->host.c_str());
+ }
+ hdfsBuilderSetNameNodePort(builder, config->port);
+ if (!config->user.empty()) {
+ hdfsBuilderSetUserName(builder, config->user.c_str());
+ }
+ if (!config->kerb_ticket.empty()) {
+ hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str());
+ }
+ fs_ = hdfsBuilderConnect(builder);
if (fs_ == nullptr) { return Status::IOError("HDFS connection failed"); }
namenode_host_ = config->host;
port_ = config->port;
user_ = config->user;
+ kerb_ticket_ = config->kerb_ticket;
return Status::OK();
}
@@ -425,6 +438,7 @@ class HdfsClient::HdfsClientImpl {
std::string namenode_host_;
std::string user_;
int port_;
+ std::string kerb_ticket_;
hdfsFS fs_;
};
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/hdfs.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h
index 4a4e3ec..48699c9 100644
--- a/cpp/src/arrow/io/hdfs.h
+++ b/cpp/src/arrow/io/hdfs.h
@@ -60,19 +60,16 @@ struct HdfsConnectionConfig {
std::string host;
int port;
std::string user;
-
- // TODO: Kerberos, etc.
+ std::string kerb_ticket;
};
class ARROW_EXPORT HdfsClient : public FileSystemClient {
public:
~HdfsClient();
- // Connect to an HDFS cluster at indicated host, port, and as user
+ // Connect to an HDFS cluster given a configuration
//
- // @param host (in)
- // @param port (in)
- // @param user (in): user to identify as
+ // @param config (in): configuration for connecting
// @param fs (out): the created client
// @returns Status
static Status Connect(
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/cpp/src/arrow/io/libhdfs_shim.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc
index f256c31..07eb625 100644
--- a/cpp/src/arrow/io/libhdfs_shim.cc
+++ b/cpp/src/arrow/io/libhdfs_shim.cc
@@ -73,9 +73,17 @@ static HINSTANCE libjvm_handle = NULL;
// NOTE(wesm): cpplint does not like use of short and other imprecise C types
-static hdfsFS (*ptr_hdfsConnectAsUser)(
- const char* host, tPort port, const char* user) = NULL;
-static hdfsFS (*ptr_hdfsConnect)(const char* host, tPort port) = NULL;
+static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL;
+static void (*ptr_hdfsBuilderSetNameNode)(
+ hdfsBuilder* bld, const char* nn) = NULL;
+static void (*ptr_hdfsBuilderSetNameNodePort)(
+ hdfsBuilder* bld, tPort port) = NULL;
+static void (*ptr_hdfsBuilderSetUserName)(
+ hdfsBuilder* bld, const char* userName) = NULL;
+static void (*ptr_hdfsBuilderSetKerbTicketCachePath)(
+ hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL;
+static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL;
+
static int (*ptr_hdfsDisconnect)(hdfsFS fs) = NULL;
static hdfsFile (*ptr_hdfsOpenFile)(hdfsFS fs, const char* path, int flags,
@@ -149,18 +157,29 @@ static void* get_symbol(const char* symbol) {
#endif
}
-hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char* user) {
- return ptr_hdfsConnectAsUser(host, port, user);
+hdfsBuilder* hdfsNewBuilder(void) {
+ return ptr_hdfsNewBuilder();
}
-// Returns NULL on failure
-hdfsFS hdfsConnect(const char* host, tPort port) {
- if (ptr_hdfsConnect) {
- return ptr_hdfsConnect(host, port);
- } else {
- // TODO: error reporting when shim setup fails
- return NULL;
- }
+void hdfsBuilderSetNameNode(hdfsBuilder* bld, const char* nn) {
+ ptr_hdfsBuilderSetNameNode(bld, nn);
+}
+
+void hdfsBuilderSetNameNodePort(hdfsBuilder* bld, tPort port) {
+ ptr_hdfsBuilderSetNameNodePort(bld, port);
+}
+
+void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) {
+ ptr_hdfsBuilderSetUserName(bld, userName);
+}
+
+void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld,
+ const char* kerbTicketCachePath) {
+ ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath);
+}
+
+hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) {
+ return ptr_hdfsBuilderConnect(bld);
}
int hdfsDisconnect(hdfsFS fs) {
@@ -342,18 +361,36 @@ int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime) {
}
static std::vector<fs::path> get_potential_libhdfs_paths() {
- std::vector<fs::path> libhdfs_potential_paths = {
- // find one in the local directory
- fs::path("./libhdfs.so"), fs::path("./hdfs.dll"),
- // find a global libhdfs.so
- fs::path("libhdfs.so"), fs::path("hdfs.dll"),
+ std::vector<fs::path> libhdfs_potential_paths;
+ std::string file_name;
+
+ // OS-specific file name
+#ifdef __WIN32
+ file_name = "hdfs.dll";
+#elif __APPLE__
+ file_name = "libhdfs.dylib";
+#else
+ file_name = "libhdfs.so";
+#endif
+
+ // Common paths
+ std::vector<fs::path> search_paths = {
+ fs::path(""),
+ fs::path(".")
};
+ // Path from environment variable
const char* hadoop_home = std::getenv("HADOOP_HOME");
if (hadoop_home != nullptr) {
- auto path = fs::path(hadoop_home) / "lib/native/libhdfs.so";
- libhdfs_potential_paths.push_back(path);
+ auto path = fs::path(hadoop_home) / "lib/native";
+ search_paths.push_back(path);
}
+
+ // All paths with file name
+ for (auto& path : search_paths) {
+ libhdfs_potential_paths.push_back(path / file_name);
+ }
+
return libhdfs_potential_paths;
}
@@ -371,7 +408,7 @@ static std::vector<fs::path> get_potential_libjvm_paths() {
file_name = "jvm.dll";
#elif __APPLE__
search_prefixes = {""};
- search_suffixes = {""};
+ search_suffixes = {"", "/jre/lib/server"};
file_name = "libjvm.dylib";
// SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are
@@ -513,8 +550,12 @@ Status ARROW_EXPORT ConnectLibHdfs() {
return Status::IOError("Prior attempt to load libhdfs failed");
}
- GET_SYMBOL_REQUIRED(hdfsConnect);
- GET_SYMBOL_REQUIRED(hdfsConnectAsUser);
+ GET_SYMBOL_REQUIRED(hdfsNewBuilder);
+ GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNode);
+ GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNodePort);
+ GET_SYMBOL_REQUIRED(hdfsBuilderSetUserName);
+ GET_SYMBOL_REQUIRED(hdfsBuilderSetKerbTicketCachePath);
+ GET_SYMBOL_REQUIRED(hdfsBuilderConnect);
GET_SYMBOL_REQUIRED(hdfsCreateDirectory);
GET_SYMBOL_REQUIRED(hdfsDelete);
GET_SYMBOL_REQUIRED(hdfsDisconnect);
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/python/pyarrow/includes/libarrow_io.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd
index 8074915..7703415 100644
--- a/python/pyarrow/includes/libarrow_io.pxd
+++ b/python/pyarrow/includes/libarrow_io.pxd
@@ -93,6 +93,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil:
c_string host
int port
c_string user
+ c_string kerb_ticket
cdef cppclass HdfsPathInfo:
ObjectType kind;
http://git-wip-us.apache.org/repos/asf/arrow/blob/6178bf7b/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 16ebfa1..0e6b81e 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -288,9 +288,6 @@ cdef class HdfsClient:
shared_ptr[CHdfsClient] client
cdef readonly:
- object host
- int port
- object user
bint is_open
def __cinit__(self):
@@ -301,6 +298,9 @@ cdef class HdfsClient:
self.close()
def close(self):
+ """
+ Disconnect from the HDFS cluster
+ """
self._ensure_client()
with nogil:
check_status(self.client.get().Disconnect())
@@ -313,14 +313,21 @@ cdef class HdfsClient:
raise IOError('HDFS client is closed')
@classmethod
- def connect(cls, host, port, user):
+ def connect(cls, host="default", port=0, user=None, kerb_ticket=None):
"""
+ Connect to an HDFS cluster. All parameters are optional and should
+ only be set if the defaults need to be overridden.
+
+ Authentication should be automatic if the HDFS cluster uses Kerberos.
+ However, if a username is specified, then the ticket cache will likely
+ be required.
Parameters
----------
- host :
- port :
- user :
+ host : NameNode. Set to "default" for fs.defaultFS from core-site.xml.
+ port : NameNode's port. Set to 0 for default or logical (HA) nodes.
+ user : Username when connecting to HDFS; None implies login user.
+ kerb_ticket : Path to Kerberos ticket cache.
Notes
-----
@@ -335,9 +342,13 @@ cdef class HdfsClient:
HdfsClient out = HdfsClient()
HdfsConnectionConfig conf
- conf.host = tobytes(host)
+ if host is not None:
+ conf.host = tobytes(host)
conf.port = port
- conf.user = tobytes(user)
+ if user is not None:
+ conf.user = tobytes(user)
+ if kerb_ticket is not None:
+ conf.kerb_ticket = tobytes(kerb_ticket)
with nogil:
check_status(CHdfsClient.Connect(&conf, &out.client))