You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/28 14:31:02 UTC
arrow git commit: ARROW-1281: [C++/Python] Add Docker setup for
testing HDFS IO in C++ and Python
Repository: arrow
Updated Branches:
refs/heads/master ff6c6e0f9 -> 8841bc071
ARROW-1281: [C++/Python] Add Docker setup for testing HDFS IO in C++ and Python
We aren't testing this in Travis CI because spinning up an HDFS cluster is a bit heavyweight, but this will at least enable us to do easier ongoing validation that this functionality is working properly.
Author: Wes McKinney <we...@twosigma.com>
Closes #895 from wesm/ARROW-1281 and squashes the following commits:
a96e1665 [Wes McKinney] Fix header
4effee78 [Wes McKinney] Fix license header
d12eea48 [Wes McKinney] Fix license headers
591e7c6b [Wes McKinney] Add Python tests
bbbd8c10 [Wes McKinney] Docker HDFS testing scripts, use hdfs-client.xml from Apache HAWQ (incubating)
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8841bc07
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8841bc07
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8841bc07
Branch: refs/heads/master
Commit: 8841bc071b1d0a3eff2592af5ca9b5591ed9e5c5
Parents: ff6c6e0
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Jul 28 10:30:57 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jul 28 10:30:57 2017 -0400
----------------------------------------------------------------------
python/testing/README.md | 26 ++
python/testing/functions.sh | 100 ++++++
python/testing/hdfs/Dockerfile | 50 +++
python/testing/hdfs/libhdfs3-hdfs-client.xml | 332 +++++++++++++++++++
python/testing/hdfs/restart_docker_container.sh | 38 +++
python/testing/hdfs/run_tests.sh | 41 +++
python/testing/set_env_common.sh | 70 ++++
python/testing/setup_toolchain.sh | 65 ++++
python/testing/test_hdfs.sh | 25 ++
9 files changed, 747 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/README.md
----------------------------------------------------------------------
diff --git a/python/testing/README.md b/python/testing/README.md
new file mode 100644
index 0000000..07970a2
--- /dev/null
+++ b/python/testing/README.md
@@ -0,0 +1,26 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Testing tools for odds and ends
+
+## Testing HDFS file interface
+
+```shell
+./test_hdfs.sh
+```
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/functions.sh
----------------------------------------------------------------------
diff --git a/python/testing/functions.sh b/python/testing/functions.sh
new file mode 100644
index 0000000..6bc342b
--- /dev/null
+++ b/python/testing/functions.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+use_gcc() {
+ export CC=gcc-4.9
+ export CXX=g++-4.9
+}
+
+use_clang() {
+ export CC=clang-4.0
+ export CXX=clang++-4.0
+}
+
+bootstrap_python_env() {
+ PYTHON_VERSION=$1
+ CONDA_ENV_DIR=$BUILD_DIR/pyarrow-test-$PYTHON_VERSION
+
+ conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION cmake curl
+ source activate $CONDA_ENV_DIR
+
+ python --version
+ which python
+
+ # faster builds, please
+ conda install -y -q nomkl pip numpy pandas cython
+}
+
+build_pyarrow() {
+ # Other stuff pip install
+ pushd $ARROW_PYTHON_DIR
+ pip install -r requirements.txt
+ python setup.py build_ext --with-parquet --with-plasma \
+ install --single-version-externally-managed --record=record.text
+ popd
+
+ python -c "import pyarrow.parquet"
+ python -c "import pyarrow.plasma"
+
+ export PYARROW_PATH=$CONDA_PREFIX/lib/python$PYTHON_VERSION/site-packages/pyarrow
+}
+
+build_arrow() {
+ mkdir -p $ARROW_CPP_BUILD_DIR
+ pushd $ARROW_CPP_BUILD_DIR
+
+ cmake -GNinja \
+ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+ -DARROW_NO_DEPRECATED_API=ON \
+ -DARROW_PYTHON=ON \
+ -DARROW_PLASMA=ON \
+ -DARROW_BOOST_USE_SHARED=off \
+ $ARROW_CPP_DIR
+
+ ninja
+ ninja install
+ popd
+}
+
+build_parquet() {
+ PARQUET_DIR=$BUILD_DIR/parquet
+ mkdir -p $PARQUET_DIR
+
+ git clone https://github.com/apache/parquet-cpp.git $PARQUET_DIR
+
+ pushd $PARQUET_DIR
+ mkdir build-dir
+ cd build-dir
+
+ cmake \
+ -GNinja \
+ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
+ -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \
+ -DPARQUET_BOOST_USE_SHARED=off \
+ -DPARQUET_BUILD_BENCHMARKS=off \
+ -DPARQUET_BUILD_EXECUTABLES=off \
+ -DPARQUET_BUILD_TESTS=off \
+ ..
+
+ ninja
+ ninja install
+
+ popd
+}
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/Dockerfile
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/Dockerfile b/python/testing/hdfs/Dockerfile
new file mode 100644
index 0000000..9735513
--- /dev/null
+++ b/python/testing/hdfs/Dockerfile
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# TODO Replace this with a complete clean image build
+FROM cpcloud86/impala:metastore
+
+USER root
+
+RUN apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+ apt-get update && \
+ apt-get install -y \
+ gcc-4.9 \
+ g++-4.9 \
+ build-essential \
+ autotools-dev \
+ autoconf \
+ gtk-doc-tools \
+ autoconf-archive \
+ libgirepository1.0-dev \
+ libtool \
+ libjemalloc-dev \
+ ccache \
+ valgrind \
+ gdb
+
+RUN wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key|sudo apt-key add - && \
+ apt-add-repository -y \
+ "deb http://llvm.org/apt/trusty/ llvm-toolchain-trusty-4.0 main" && \
+ apt-get update && \
+ apt-get install -y clang-4.0 clang-format-4.0 clang-tidy-4.0
+
+USER ubuntu
+
+RUN wget -O /tmp/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+ bash /tmp/miniconda.sh -b -p /home/ubuntu/miniconda && \
+ rm /tmp/miniconda.sh
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/libhdfs3-hdfs-client.xml
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/libhdfs3-hdfs-client.xml b/python/testing/hdfs/libhdfs3-hdfs-client.xml
new file mode 100644
index 0000000..f929929
--- /dev/null
+++ b/python/testing/hdfs/libhdfs3-hdfs-client.xml
@@ -0,0 +1,332 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<!-- From Apache HAWQ (incubating) -->
+
+<configuration>
+
+<!-- KDC
+ <property>
+ <name>hadoop.security.authentication</name>
+ <value>kerberos</value>
+ </property>
+KDC -->
+
+<!-- HA
+ <property>
+ <name>dfs.nameservices</name>
+ <value>phdcluster</value>
+ </property>
+
+ <property>
+ <name>dfs.ha.namenodes.phdcluster</name>
+ <value>nn1,nn2</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.rpc-address.phdcluster.nn1</name>
+ <value>mdw:9000</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.rpc-address.phdcluster.nn2</name>
+ <value>smdw:9000</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.http-address.phdcluster.nn1</name>
+ <value>mdw:50070</value>
+ </property>
+
+ <property>
+ <name>dfs.namenode.http-address.phdcluster.nn2</name>
+ <value>smdw:50070</value>
+ </property>
+
+HA -->
+
+ <!-- RPC client configuration -->
+ <property>
+ <name>rpc.client.timeout</name>
+ <value>3600000</value>
+ <description>
+ timeout interval of a RPC invocation in millisecond. default is 3600000.
+ </description>
+ </property>
+ <property>
+ <name>rpc.client.connect.tcpnodelay</name>
+ <value>true</value>
+ <description>
+ whether set socket TCP_NODELAY to true when connect to RPC server. default is true.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.max.idle</name>
+ <value>10000</value>
+ <description>
+ the max idle time of a RPC connection in millisecond. default is 10000.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.ping.interval</name>
+ <value>10000</value>
+ <description>
+ the interval which the RPC client send a heart beat to server. 0 means disable, default is 10000.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.connect.timeout</name>
+ <value>600000</value>
+ <description>
+ the timeout interval in millisecond when the RPC client is trying to setup the connection. default is 600000.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.connect.retry</name>
+ <value>10</value>
+ <description>
+ the max retry times if the RPC client fail to setup the connection to server. default is 10.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.read.timeout</name>
+ <value>3600000</value>
+ <description>
+ the timeout interval in millisecond when the RPC client is trying to read from server. default is 3600000.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.write.timeout</name>
+ <value>3600000</value>
+ <description>
+ the timeout interval in millisecond when the RPC client is trying to write to server. default is 3600000.
+ </description>
+ </property>
+
+ <property>
+ <name>rpc.client.socket.linger.timeout</name>
+ <value>-1</value>
+ <description>
+ set value to socket SO_LINGER when connect to RPC server. -1 means default OS value. default is -1.
+ </description>
+ </property>
+
+ <!-- dfs client configuration -->
+ <property>
+ <name>dfs.client.read.shortcircuit</name>
+ <value>false</value>
+ <description>
+ whether reading block file bypass datanode if the block and the client are on the same node. default is true.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.default.replica</name>
+ <value>1</value>
+ <description>
+ the default number of replica. default is 3.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.prefetchsize</name>
+ <value>10</value>
+ <description>
+ the default number of blocks which information will be prefetched. default is 10.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.client.failover.max.attempts</name>
+ <value>15</value>
+ <description>
+ if multiple namenodes are configured, it is the max retry times when the dfs client try to issue a RPC call. default is 15.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.default.blocksize</name>
+ <value>134217728</value>
+ <description>
+ default block size. default is 134217728.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.client.log.severity</name>
+ <value>INFO</value>
+ <description>
+ the minimal log severity level, valid values include FATAL, ERROR, INFO, DEBUG1, DEBUG2, DEBUG3. default is INFO.
+ </description>
+ </property>
+
+ <!-- input client configuration -->
+ <property>
+ <name>input.connect.timeout</name>
+ <value>600000</value>
+ <description>
+ the timeout interval in millisecond when the input stream is trying to setup the connection to datanode. default is 600000.
+ </description>
+ </property>
+
+ <property>
+ <name>input.read.timeout</name>
+ <value>3600000</value>
+ <description>
+ the timeout interval in millisecond when the input stream is trying to read from datanode. default is 3600000.
+ </description>
+ </property>
+
+ <property>
+ <name>input.write.timeout</name>
+ <value>3600000</value>
+ <description>
+ the timeout interval in millisecond when the input stream is trying to write to datanode. default is 3600000.
+ </description>
+ </property>
+
+ <property>
+ <name>input.localread.default.buffersize</name>
+ <value>2097152</value>
+ <description>
+ number of bytes of the buffer which is used to hold the data from block file and verify checksum.
+ it is only used when "dfs.client.read.shortcircuit" is set to true. default is 1048576.
+ </description>
+ </property>
+
+ <property>
+ <name>input.localread.blockinfo.cachesize</name>
+ <value>1000</value>
+ <description>
+ the size of block file path information cache. default is 1000.
+ </description>
+ </property>
+
+ <property>
+ <name>input.read.getblockinfo.retry</name>
+ <value>3</value>
+ <description>
+ the max retry times when the client fail to get block information from namenode. default is 3.
+ </description>
+ </property>
+
+ <!-- output client configuration -->
+ <property>
+ <name>output.replace-datanode-on-failure</name>
+ <value>false</value>
+ <description>
+ whether the client add new datanode into pipeline if the number of nodes in pipeline is less than the specified number of replicas. default is false.
+ </description>
+ </property>
+
+ <property>
+ <name>output.default.chunksize</name>
+ <value>512</value>
+ <description>
+ the number of bytes of a chunk in pipeline. default is 512.
+ </description>
+ </property>
+
+ <property>
+ <name>output.default.packetsize</name>
+ <value>65536</value>
+ <description>
+ the number of bytes of a packet in pipeline. default is 65536.
+ </description>
+ </property>
+
+ <property>
+ <name>output.default.write.retry</name>
+ <value>10</value>
+ <description>
+ the max retry times when the client fail to setup the pipeline. default is 10.
+ </description>
+ </property>
+
+ <property>
+ <name>output.connect.timeout</name>
+ <value>600000</value>
+ <description>
+ the timeout interval in millisecond when the output stream is trying to setup the connection to datanode. default is 600000.
+ </description>
+ </property>
+
+ <property>
+ <name>output.read.timeout</name>
+ <value>3600000</value>
+ <description>
+ the timeout interval in millisecond when the output stream is trying to read from datanode. default is 3600000.
+ </description>
+ </property>
+
+ <property>
+ <name>output.write.timeout</name>
+ <value>3600000</value>
+ <description>
+ the timeout interval in millisecond when the output stream is trying to write to datanode. default is 3600000.
+ </description>
+ </property>
+
+ <property>
+ <name>output.packetpool.size</name>
+ <value>1024</value>
+ <description>
+ the max number of packets in a file's packet pool. default is 1024.
+ </description>
+ </property>
+
+ <property>
+ <name>output.close.timeout</name>
+ <value>900000</value>
+ <description>
+ the timeout interval in millisecond when close an output stream. default is 900000.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.domain.socket.path</name>
+ <value>/var/lib/hadoop-hdfs/dn_socket</value>
+ <description>
+ Optional. This is a path to a UNIX domain socket that will be used for
+ communication between the DataNode and local HDFS clients.
+ If the string "_PORT" is present in this path, it will be replaced by the
+ TCP port of the DataNode.
+ </description>
+ </property>
+
+ <property>
+ <name>dfs.client.use.legacy.blockreader.local</name>
+ <value>false</value>
+ <description>
+ Legacy short-circuit reader implementation based on HDFS-2246 is used
+ if this configuration parameter is true.
+ This is for the platforms other than Linux
+ where the new implementation based on HDFS-347 is not available.
+ </description>
+ </property>
+
+</configuration>
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/restart_docker_container.sh
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/restart_docker_container.sh b/python/testing/hdfs/restart_docker_container.sh
new file mode 100644
index 0000000..15076cc
--- /dev/null
+++ b/python/testing/hdfs/restart_docker_container.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+export ARROW_TEST_NN_HOST=arrow-hdfs
+export ARROW_TEST_IMPALA_HOST=$ARROW_TEST_NN_HOST
+export ARROW_TEST_IMPALA_PORT=21050
+export ARROW_TEST_WEBHDFS_PORT=50070
+export ARROW_TEST_WEBHDFS_USER=ubuntu
+
+docker stop $ARROW_TEST_NN_HOST
+docker rm $ARROW_TEST_NN_HOST
+
+docker run -d -it --name $ARROW_TEST_NN_HOST \
+ -v $PWD:/io \
+ --hostname $ARROW_TEST_NN_HOST \
+ --shm-size=2gb \
+ -p $ARROW_TEST_WEBHDFS_PORT -p $ARROW_TEST_IMPALA_PORT \
+ arrow-hdfs-test
+
+while ! docker exec $ARROW_TEST_NN_HOST impala-shell -q 'SELECT VERSION()'; do
+ sleep 1
+done
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/hdfs/run_tests.sh
----------------------------------------------------------------------
diff --git a/python/testing/hdfs/run_tests.sh b/python/testing/hdfs/run_tests.sh
new file mode 100755
index 0000000..e0d36df
--- /dev/null
+++ b/python/testing/hdfs/run_tests.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+HERE=$(cd `dirname "${BASH_SOURCE[0]:-$0}"` && pwd)
+
+source $HERE/../set_env_common.sh
+source $HERE/../setup_toolchain.sh
+source $HERE/../functions.sh
+
+git clone https://github.com/apache/arrow.git $ARROW_CHECKOUT
+
+use_clang
+
+bootstrap_python_env 3.6
+
+build_arrow
+build_parquet
+
+build_pyarrow
+
+$ARROW_CPP_BUILD_DIR/debug/io-hdfs-test
+
+python -m pytest -vv -r sxX -s $PYARROW_PATH --parquet --hdfs
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/set_env_common.sh
----------------------------------------------------------------------
diff --git a/python/testing/set_env_common.sh b/python/testing/set_env_common.sh
new file mode 100644
index 0000000..00251f9
--- /dev/null
+++ b/python/testing/set_env_common.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+export MINICONDA=$HOME/miniconda
+export CPP_TOOLCHAIN=$HOME/cpp-toolchain
+
+export PATH="$MINICONDA/bin:$PATH"
+export CONDA_PKGS_DIRS=$HOME/.conda_packages
+
+export ARROW_CHECKOUT=$HOME/arrow
+export BUILD_DIR=$ARROW_CHECKOUT
+
+export BUILD_OS_NAME=linux
+export BUILD_TYPE=debug
+
+export ARROW_CPP_DIR=$BUILD_DIR/cpp
+export ARROW_PYTHON_DIR=$BUILD_DIR/python
+export ARROW_C_GLIB_DIR=$BUILD_DIR/c_glib
+export ARROW_JAVA_DIR=${BUILD_DIR}/java
+export ARROW_JS_DIR=${BUILD_DIR}/js
+export ARROW_INTEGRATION_DIR=$BUILD_DIR/integration
+
+export CPP_BUILD_DIR=$BUILD_DIR/cpp-build
+
+export ARROW_CPP_INSTALL=$BUILD_DIR/cpp-install
+export ARROW_CPP_BUILD_DIR=$BUILD_DIR/cpp-build
+export ARROW_C_GLIB_INSTALL=$BUILD_DIR/c-glib-install
+
+export ARROW_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
+export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
+
+export BOOST_ROOT=$CPP_TOOLCHAIN
+export PATH=$CPP_TOOLCHAIN/bin:$PATH
+export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH
+
+export VALGRIND="valgrind --tool=memcheck"
+
+export ARROW_HOME=$CPP_TOOLCHAIN
+export PARQUET_HOME=$CPP_TOOLCHAIN
+
+# Arrow test variables
+
+export JAVA_HOME=/usr/lib/jvm/java-7-oracle
+export HADOOP_HOME=/usr/lib/hadoop
+export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
+export HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$HADOOP_HOME/lib/native"
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HADOOP_HOME/lib/native/
+
+export ARROW_HDFS_TEST_HOST=arrow-hdfs
+export ARROW_HDFS_TEST_PORT=9000
+export ARROW_HDFS_TEST_USER=ubuntu
+export ARROW_LIBHDFS_DIR=/usr/lib
+
+export LIBHDFS3_CONF=/io/hdfs/libhdfs3-hdfs-client.xml
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/setup_toolchain.sh
----------------------------------------------------------------------
diff --git a/python/testing/setup_toolchain.sh b/python/testing/setup_toolchain.sh
new file mode 100644
index 0000000..c3837b4
--- /dev/null
+++ b/python/testing/setup_toolchain.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+export PATH="$MINICONDA/bin:$PATH"
+conda update -y -q conda
+conda config --set auto_update_conda false
+conda info -a
+
+conda config --set show_channel_urls True
+
+# Help with SSL timeouts to S3
+conda config --set remote_connect_timeout_secs 12
+
+conda config --add channels https://repo.continuum.io/pkgs/free
+conda config --add channels conda-forge
+conda info -a
+
+# faster builds, please
+conda install -y nomkl
+
+conda install --y conda-build jinja2 anaconda-client cmake curl
+
+# Set up C++ toolchain
+conda create -y -q -p $CPP_TOOLCHAIN python=3.6 \
+ jemalloc=4.4.0 \
+ nomkl \
+ boost-cpp \
+ rapidjson \
+ flatbuffers \
+ gflags \
+ lz4-c \
+ snappy \
+ zstd \
+ brotli \
+ zlib \
+ git \
+ cmake \
+ curl \
+ thrift-cpp \
+ libhdfs3 \
+ ninja
+
+if [ $BUILD_OS_NAME == "osx" ]; then
+ brew update > /dev/null
+ brew install jemalloc
+ brew install ccache
+fi
http://git-wip-us.apache.org/repos/asf/arrow/blob/8841bc07/python/testing/test_hdfs.sh
----------------------------------------------------------------------
diff --git a/python/testing/test_hdfs.sh b/python/testing/test_hdfs.sh
new file mode 100755
index 0000000..016e54a
--- /dev/null
+++ b/python/testing/test_hdfs.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -ex
+
+docker build -t arrow-hdfs-test -f hdfs/Dockerfile .
+bash hdfs/restart_docker_container.sh
+docker exec -it arrow-hdfs /io/hdfs/run_tests.sh
+docker stop arrow-hdfs