Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/28 14:31:02 UTC

arrow git commit: ARROW-1281: [C++/Python] Add Docker setup for testing HDFS IO in C++ and Python

Repository: arrow
Updated Branches:
  refs/heads/master ff6c6e0f9 -> 8841bc071


ARROW-1281: [C++/Python] Add Docker setup for testing HDFS IO in C++ and Python

We aren't testing this in Travis CI because spinning up an HDFS cluster is fairly heavyweight, but this will at least make it easier to validate on an ongoing basis that this functionality works properly.
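
The end-to-end flow, as wired up in test_hdfs.sh below, is: build the Docker image, (re)start the container, then run the C++ and Python suites inside it:

```shell
# Build the image defined by hdfs/Dockerfile and tag it arrow-hdfs-test
docker build -t arrow-hdfs-test -f hdfs/Dockerfile .

# (Re)start the arrow-hdfs container and wait until the cluster accepts queries
bash hdfs/restart_docker_container.sh

# Run the C++ and Python HDFS tests inside the container
docker exec -it arrow-hdfs /io/hdfs/run_tests.sh
```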

Author: Wes McKinney <we...@twosigma.com>

Closes #895 from wesm/ARROW-1281 and squashes the following commits:

a96e1665 [Wes McKinney] Fix header
4effee78 [Wes McKinney] Fix license header
d12eea48 [Wes McKinney] Fix license headers
591e7c6b [Wes McKinney] Add Python tests
bbbd8c10 [Wes McKinney] Docker HDFS testing scripts, use hdfs-client.xml from Apache HAWQ (incubating)


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8841bc07
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8841bc07
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8841bc07

Branch: refs/heads/master
Commit: 8841bc071b1d0a3eff2592af5ca9b5591ed9e5c5
Parents: ff6c6e0
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Jul 28 10:30:57 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jul 28 10:30:57 2017 -0400

----------------------------------------------------------------------
 python/testing/README.md                        |  26 ++
 python/testing/functions.sh                     | 100 ++++++
 python/testing/hdfs/Dockerfile                  |  50 +++
 python/testing/hdfs/libhdfs3-hdfs-client.xml    | 332 +++++++++++++++++++
 python/testing/hdfs/restart_docker_container.sh |  38 +++
 python/testing/hdfs/run_tests.sh                |  41 +++
 python/testing/set_env_common.sh                |  70 ++++
 python/testing/setup_toolchain.sh               |  65 ++++
 python/testing/test_hdfs.sh                     |  25 ++
 9 files changed, 747 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/README.md
----------------------------------------------------------------------
diff --git a/python/testing/README.md b/python/testing/README.md
new file mode 100644
index 0000000..07970a2
--- /dev/null
+++ b/python/testing/README.md
@@ -0,0 +1,26 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Testing tools for odds and ends
+
+## Testing HDFS file interface
+
+```shell
+./test_hdfs.sh
+```
\ No newline at end of file
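
Note that test_hdfs.sh references hdfs/Dockerfile and the helper scripts by relative path, so it should be invoked from python/testing:

```shell
cd python/testing   # paths inside test_hdfs.sh are relative to this directory
./test_hdfs.sh
```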

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/functions.sh
----------------------------------------------------------------------
diff --git a/python/testing/functions.sh b/python/testing/functions.sh
new file mode 100644
index 0000000..6bc342b
--- /dev/null
+++ b/python/testing/functions.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+use_gcc() {
+  export CC=gcc-4.9
+  export CXX=g++-4.9
+}
+
+use_clang() {
+  export CC=clang-4.0
+  export CXX=clang++-4.0
+}
+
+bootstrap_python_env() {
+  PYTHON_VERSION=$1
+  CONDA_ENV_DIR=$BUILD_DIR/pyarrow-test-$PYTHON_VERSION
+
+  conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl
+  source activate $CONDA_ENV_DIR
+
+  python --version
+  which python
+
+  # faster builds, please
+  conda install -y -q nomkl pip numpy pandas cython
+}
+
+build_pyarrow() {
+  # Install the remaining Python dependencies with pip
+  pushd $ARROW_PYTHON_DIR
+  pip install -r requirements.txt
+  python setup.py build_ext --with-parquet --with-plasma \
+         install --single-version-externally-managed --record=record.text
+  popd
+
+  python -c "import pyarrow.parquet"
+  python -c "import pyarrow.plasma"
+
+  export PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow
+}
+
+build_arrow() {
+  mkdir -p $ARROW_CPP_BUILD_DIR
+  pushd $ARROW_CPP_BUILD_DIR
+
+  cmake -GNinja \
+        -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+        -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+        -DARROW_NO_DEPRECATED_API=ON \
+        -DARROW_PYTHON=ON \
+        -DARROW_PLASMA=ON \
+        -DARROW_BOOST_USE_SHARED=off \
+        $ARROW_CPP_DIR
+
+  ninja
+  ninja install
+  popd
+}
+
+build_parquet() {
+  PARQUET_DIR=$BUILD_DIR/parquet
+  mkdir -p $PARQUET_DIR
+
+  git clone https://github.com/apache/parquet-cpp.git $PARQUET_DIR
+
+  pushd $PARQUET_DIR
+  mkdir build-dir
+  cd build-dir
+
+  cmake \
+      -GNinja \
+      -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+      -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \
+      -DPARQUET_BOOST_USE_SHARED=off \
+      -DPARQUET_BUILD_BENCHMARKS=off \
+      -DPARQUET_BUILD_EXECUTABLES=off \
+      -DPARQUET_BUILD_TESTS=off \
+      ..
+
+  ninja
+  ninja install
+
+  popd
+}
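
These helpers assume the variables exported by set_env_common.sh (BUILD_DIR, ARROW_HOME, PARQUET_HOME, and so on); run_tests.sh below sources set_env_common.sh, setup_toolchain.sh, and this file in that order. A sketch of the same sequence run by hand:

```shell
source set_env_common.sh    # BUILD_DIR, ARROW_HOME, PARQUET_HOME, toolchain paths
source setup_toolchain.sh   # creates the conda-based $CPP_TOOLCHAIN
source functions.sh

use_clang                   # or use_gcc
bootstrap_python_env 3.6    # conda env with numpy, pandas, cython
build_arrow                 # Arrow C++ into $ARROW_HOME
build_parquet               # parquet-cpp into $PARQUET_HOME
build_pyarrow               # pyarrow with --with-parquet --with-plasma
```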

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/Dockerfile
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/Dockerfile b/python/testing/hdfs/Dockerfile
new file mode 100644
index 0000000..9735513
--- /dev/null
+++ b/python/testing/hdfs/Dockerfile
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# TODO Replace this with a complete clean image build
+FROM cpcloud86/impala:metastore
+
+USER root
+
+RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+    apt-get update && \
+    apt-get install -y \
+            gcc-4.9 \
+            g++-4.9 \
+            build-essential \
+            autotools-dev \
+            autoconf \
+            gtk-doc-tools \
+            autoconf-archive \
+            libgirepository1.0-dev \
+            libtool \
+            libjemalloc-dev \
+            ccache \
+            valgrind \
+            gdb
+
+RUN wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add - && \
+    apt-add-repository -y \
+     "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" && \
+    apt-get update && \
+    apt-get install -y clang-4.0 clang-format-4.0 clang-tidy-4.0
+
+USER ubuntu
+
+RUN wget -O /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \
+    rm /tmp/miniconda.sh

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/libhdfs3-hdfs-client.xml
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/libhdfs3-hdfs-client.xml b/python/testing/hdfs/libhdfs3-hdfs-client.xml
new file mode 100644
index 0000000..f929929
--- /dev/null
+++ b/python/testing/hdfs/libhdfs3-hdfs-client.xml
@@ -0,0 +1,332 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<!-- From Apache HAWQ (incubating) -->
+
+<configuration>
+
+<!-- KDC
+	<property>
+		<name>hadoop.security.authentication</name>
+		<value>kerberos</value>
+	</property>
+KDC -->
+
+<!-- HA
+	<property>
+		<name>dfs.nameservices</name>
+		<value>phdcluster</value>
+	</property>
+
+	<property>
+		<name>dfs.ha.namenodes.phdcluster</name>
+		<value>nn1,nn2</value>
+	</property>
+
+	<property>
+		<name>dfs.namenode.rpc-address.phdcluster.nn1</name>
+		<value>mdw:9000</value>
+	</property>
+
+	<property>
+		<name>dfs.namenode.rpc-address.phdcluster.nn2</name>
+		<value>smdw:9000</value>
+	</property>
+
+	<property>
+		<name>dfs.namenode.http-address.phdcluster.nn1</name>
+		<value>mdw:50070</value>
+	</property>
+
+	<property>
+		<name>dfs.namenode.http-address.phdcluster.nn2</name>
+		<value>smdw:50070</value>
+	</property>
+
+HA -->
+
+	<!-- RPC client configuration -->
+	<property>
+		<name>rpc.client.timeout</name>
+		<value>3600000</value>
+		<description>
+		Timeout interval of an RPC invocation, in milliseconds. Default is 3600000.
+		</description>
+	</property>
+	<property>
+		<name>rpc.client.connect.tcpnodelay</name>
+		<value>true</value>
+		<description>
+		Whether to set TCP_NODELAY on the socket when connecting to the RPC server. Default is true.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.max.idle</name>
+		<value>10000</value>
+		<description>
+		The maximum idle time of an RPC connection, in milliseconds. Default is 10000.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.ping.interval</name>
+		<value>10000</value>
+		<description>
+		The interval at which the RPC client sends a heartbeat to the server, in milliseconds. 0 disables the heartbeat. Default is 10000.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.connect.timeout</name>
+		<value>600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the RPC client to set up a connection. Default is 600000.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.connect.retry</name>
+		<value>10</value>
+		<description>
+		The maximum number of retries if the RPC client fails to set up a connection to the server. Default is 10.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.read.timeout</name>
+		<value>3600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the RPC client to read from the server. Default is 3600000.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.write.timeout</name>
+		<value>3600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the RPC client to write to the server. Default is 3600000.
+		</description>
+	</property>
+
+	<property>
+		<name>rpc.client.socket.linger.timeout</name>
+		<value>-1</value>
+		<description>
+		The SO_LINGER value to set on the socket when connecting to the RPC server. -1 means the OS default. Default is -1.
+		</description>
+	</property>
+
+	<!-- dfs client configuration -->
+	<property>
+		<name>dfs.client.read.shortcircuit</name>
+		<value>false</value>
+		<description>
+		Whether reads bypass the datanode and read the block file directly when the block and the client are on the same node. Default is true.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.default.replica</name>
+		<value>1</value>
+		<description>
+		The default number of replicas. Default is 3.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.prefetchsize</name>
+		<value>10</value>
+		<description>
+		The default number of blocks whose information is prefetched. Default is 10.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.client.failover.max.attempts</name>
+		<value>15</value>
+		<description>
+		If multiple namenodes are configured, the maximum number of retries when the dfs client tries to issue an RPC call. Default is 15.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.default.blocksize</name>
+		<value>134217728</value>
+		<description>
+		The default block size, in bytes. Default is 134217728.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.client.log.severity</name>
+		<value>INFO</value>
+		<description>
+		The minimum log severity level; valid values are FATAL, ERROR, INFO, DEBUG1, DEBUG2, and DEBUG3. Default is INFO.
+		</description>
+	</property>
+
+	<!-- input client configuration -->
+	<property>
+		<name>input.connect.timeout</name>
+		<value>600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the input stream to set up a connection to a datanode. Default is 600000.
+		</description>
+	</property>
+
+	<property>
+		<name>input.read.timeout</name>
+		<value>3600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the input stream to read from a datanode. Default is 3600000.
+		</description>
+	</property>
+
+	<property>
+		<name>input.write.timeout</name>
+		<value>3600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the input stream to write to a datanode. Default is 3600000.
+		</description>
+	</property>
+
+	<property>
+		<name>input.localread.default.buffersize</name>
+		<value>2097152</value>
+		<description>
+		The size, in bytes, of the buffer used to hold data from the block file and verify the checksum.
+		Only used when "dfs.client.read.shortcircuit" is set to true. Default is 1048576.
+		</description>
+	</property>
+
+	<property>
+		<name>input.localread.blockinfo.cachesize</name>
+		<value>1000</value>
+		<description>
+		The size of the block file path information cache. Default is 1000.
+		</description>
+	</property>
+
+	<property>
+		<name>input.read.getblockinfo.retry</name>
+		<value>3</value>
+		<description>
+		The maximum number of retries when the client fails to get block information from the namenode. Default is 3.
+		</description>
+	</property>
+
+	<!-- output client configuration -->
+	<property>
+		<name>output.replace-datanode-on-failure</name>
+		<value>false</value>
+		<description>
+		Whether the client adds a new datanode to the pipeline when the number of nodes in the pipeline is less than the specified number of replicas. Default is false.
+		</description>
+	</property>
+
+	<property>
+		<name>output.default.chunksize</name>
+		<value>512</value>
+		<description>
+		The number of bytes in a chunk in the pipeline. Default is 512.
+		</description>
+	</property>
+
+	<property>
+		<name>output.default.packetsize</name>
+		<value>65536</value>
+		<description>
+		The number of bytes in a packet in the pipeline. Default is 65536.
+		</description>
+	</property>
+
+	<property>
+		<name>output.default.write.retry</name>
+		<value>10</value>
+		<description>
+		The maximum number of retries when the client fails to set up the pipeline. Default is 10.
+		</description>
+	</property>
+
+	<property>
+		<name>output.connect.timeout</name>
+		<value>600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the output stream to set up a connection to a datanode. Default is 600000.
+		</description>
+	</property>
+
+	<property>
+		<name>output.read.timeout</name>
+		<value>3600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the output stream to read from a datanode. Default is 3600000.
+		</description>
+	</property>
+
+	<property>
+		<name>output.write.timeout</name>
+		<value>3600000</value>
+		<description>
+		The timeout interval, in milliseconds, for the output stream to write to a datanode. Default is 3600000.
+		</description>
+	</property>
+
+	<property>
+		<name>output.packetpool.size</name>
+		<value>1024</value>
+		<description>
+		The maximum number of packets in a file's packet pool. Default is 1024.
+		</description>
+	</property>
+
+	<property>
+		<name>output.close.timeout</name>
+		<value>900000</value>
+		<description>
+		The timeout interval, in milliseconds, when closing an output stream. Default is 900000.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.domain.socket.path</name>
+		<value>/var/lib/hadoop-hdfs/dn_socket</value>
+		<description>
+		Optional.  This is a path to a UNIX domain socket that will be used for
+		communication between the DataNode and local HDFS clients.
+		If the string "_PORT" is present in this path, it will be replaced by the
+		TCP port of the DataNode.
+		</description>
+	</property>
+
+	<property>
+		<name>dfs.client.use.legacy.blockreader.local</name>
+		<value>false</value>
+		<description>
+		Legacy short-circuit reader implementation based on HDFS-2246 is used
+		if this configuration parameter is true.
+		This is for the platforms other than Linux
+		where the new implementation based on HDFS-347 is not available.
+		</description>
+	</property>
+
+</configuration>
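
libhdfs3 reads this file via the LIBHDFS3_CONF environment variable, which set_env_common.sh below points at /io/hdfs/libhdfs3-hdfs-client.xml. A quick way to exercise it directly, assuming the pa.hdfs.connect API of contemporary pyarrow with the libhdfs3 driver:

```shell
export LIBHDFS3_CONF=/io/hdfs/libhdfs3-hdfs-client.xml
python - <<'EOF'
import pyarrow as pa
# driver='libhdfs3' selects the C++ libhdfs3 client rather than the JNI libhdfs
fs = pa.hdfs.connect('arrow-hdfs', 9000, user='ubuntu', driver='libhdfs3')
print(fs.ls('/'))
EOF
```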

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/restart_docker_container.sh
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/restart_docker_container.sh b/python/testing/hdfs/restart_docker_container.sh
new file mode 100644
index 0000000..15076cc
--- /dev/null
+++ b/python/testing/hdfs/restart_docker_container.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+export ARROW_TEST_NN_HOST=arrow-hdfs
+export ARROW_TEST_IMPALA_HOST=$ARROW_TEST_NN_HOST
+export ARROW_TEST_IMPALA_PORT=21050
+export ARROW_TEST_WEBHDFS_PORT=50070
+export ARROW_TEST_WEBHDFS_USER=ubuntu
+
+docker stop $ARROW_TEST_NN_HOST
+docker rm $ARROW_TEST_NN_HOST
+
+docker run -d -it --name $ARROW_TEST_NN_HOST \
+       -v $PWD:/io \
+       --hostname $ARROW_TEST_NN_HOST \
+       --shm-size=2gb \
+       -p $ARROW_TEST_WEBHDFS_PORT -p $ARROW_TEST_IMPALA_PORT \
+       arrow-hdfs-test
+
+while ! docker exec $ARROW_TEST_NN_HOST impala-shell -q 'SELECT VERSION()'; do
+    sleep 1
+done
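
The loop above gates on Impala accepting queries, which in this image also implies HDFS is up. A probe against HDFS itself, assuming the hadoop CLI tools are on the container's PATH, would be:

```shell
# Wait until the namenode reports that safe mode is off
while ! docker exec arrow-hdfs hdfs dfsadmin -safemode get | grep -q 'OFF'; do
    sleep 1
done
```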

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/run_tests.sh
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/run_tests.sh b/python/testing/hdfs/run_tests.sh
new file mode 100755
index 0000000..e0d36df
--- /dev/null
+++ b/python/testing/hdfs/run_tests.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd)
+
+source $HERE/../set_env_common.sh
+source $HERE/../setup_toolchain.sh
+source $HERE/../functions.sh
+
+git clone https://github.com/apache/arrow.git $ARROW_CHECKOUT
+
+use_clang
+
+bootstrap_python_env 3.6
+
+build_arrow
+build_parquet
+
+build_pyarrow
+
+$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test
+
+python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet --hdfs
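
When iterating, the last two steps can be rerun by hand inside the container, for example to run only the HDFS-related tests (the Python module name here is assumed from the pyarrow test layout):

```shell
# C++ HDFS I/O tests only
$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test

# Python HDFS tests only, instead of the full pyarrow suite
python -m pytest -vv -r sxX -s $PYARROW_PATH/tests/test_hdfs.py --hdfs
```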

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/set_env_common.sh
----------------------------------------------------------------------
diff --git a/python/testing/set_env_common.sh b/python/testing/set_env_common.sh
new file mode 100644
index 0000000..00251f9
--- /dev/null
+++ b/python/testing/set_env_common.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+export MINICONDA=$HOME/miniconda
+export CPP_TOOLCHAIN=$HOME/cpp-toolchain
+
+export PATH="$MINICONDA/bin:$PATH"
+export CONDA_PKGS_DIRS=$HOME/.conda_packages
+
+export ARROW_CHECKOUT=$HOME/arrow
+export BUILD_DIR=$ARROW_CHECKOUT
+
+export BUILD_OS_NAME=linux
+export BUILD_TYPE=debug
+
+export ARROW_CPP_DIR=$BUILD_DIR/cpp
+export ARROW_PYTHON_DIR=$BUILD_DIR/python
+export ARROW_C_GLIB_DIR=$BUILD_DIR/c_glib
+export ARROW_JAVA_DIR=${BUILD_DIR}/java
+export ARROW_JS_DIR=${BUILD_DIR}/js
+export ARROW_INTEGRATION_DIR=$BUILD_DIR/integration
+
+export CPP_BUILD_DIR=$BUILD_DIR/cpp-build
+
+export ARROW_CPP_INSTALL=$BUILD_DIR/cpp-install
+export ARROW_CPP_BUILD_DIR=$BUILD_DIR/cpp-build
+export ARROW_C_GLIB_INSTALL=$BUILD_DIR/c-glib-install
+
+export ARROW_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
+export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
+
+export BOOST_ROOT=$CPP_TOOLCHAIN
+export PATH=$CPP_TOOLCHAIN/bin:$PATH
+export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH
+
+export VALGRIND="valgrind --tool=memcheck"
+
+export ARROW_HOME=$CPP_TOOLCHAIN
+export PARQUET_HOME=$CPP_TOOLCHAIN
+
+# Arrow test variables
+
+export JAVA_HOME=/usr/lib/jvm/java-7-oracle
+export HADOOP_HOME=/usr/lib/hadoop
+export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native"
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/
+
+export ARROW_HDFS_TEST_HOST=arrow-hdfs
+export ARROW_HDFS_TEST_PORT=9000
+export ARROW_HDFS_TEST_USER=ubuntu
+export ARROW_LIBHDFS_DIR=/usr/lib
+
+export LIBHDFS3_CONF=/io/hdfs/libhdfs3-hdfs-client.xml
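
After sourcing this file, two quick sanity checks are that the Hadoop classpath resolved and that ARROW_LIBHDFS_DIR actually contains the JNI libhdfs:

```shell
source set_env_common.sh
echo "$CLASSPATH" | tr ':' '\n' | head    # should list Hadoop jars
ls "$ARROW_LIBHDFS_DIR"/libhdfs*.so*      # the JNI library Arrow loads
```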

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/setup_toolchain.sh
----------------------------------------------------------------------
diff --git a/python/testing/setup_toolchain.sh b/python/testing/setup_toolchain.sh
new file mode 100644
index 0000000..c3837b4
--- /dev/null
+++ b/python/testing/setup_toolchain.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+export PATH="$MINICONDA/bin:$PATH"
+conda update -y -q conda
+conda config --set auto_update_conda false
+conda info -a
+
+conda config --set show_channel_urls True
+
+# Help with SSL timeouts to S3
+conda config --set remote_connect_timeout_secs 12
+
+conda config --add channels https://repo.continuum.io/pkgs/free
+conda config --add channels conda-forge
+conda info -a
+
+# faster builds, please
+conda install -y nomkl
+
+conda install -y conda-build jinja2 anaconda-client cmake curl
+
+# Set up C++ toolchain
+conda create -y -q -p $CPP_TOOLCHAIN python=3.6 \
+    jemalloc=4.4.0 \
+    nomkl \
+    boost-cpp \
+    rapidjson \
+    flatbuffers \
+    gflags \
+    lz4-c \
+    snappy \
+    zstd \
+    brotli \
+    zlib \
+    git \
+    cmake \
+    curl \
+    thrift-cpp \
+    libhdfs3 \
+    ninja
+
+if [ $BUILD_OS_NAME == "osx" ]; then
+  brew update > /dev/null
+  brew install jemalloc
+  brew install ccache
+fi

http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/test_hdfs.sh
----------------------------------------------------------------------
diff --git a/python/testing/test_hdfs.sh b/python/testing/test_hdfs.sh
new file mode 100755
index 0000000..016e54a
--- /dev/null
+++ b/python/testing/test_hdfs.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+docker build -t arrow-hdfs-test -f hdfs/Dockerfile .
+bash hdfs/restart_docker_container.sh
+docker exec -it arrow-hdfs /io/hdfs/run_tests.sh
+docker stop arrow-hdfs
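
After a failed run the container is stopped but not removed (restart_docker_container.sh only removes it on the next invocation), so it can be restarted for inspection:

```shell
docker start arrow-hdfs
docker exec -it arrow-hdfs /bin/bash   # inspect /io and the Hadoop/Impala logs
```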