You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/18 17:45:53 UTC
[arrow] branch master updated: ARROW-1968: [C++/Python] Add basic
unit tests for ORC reader
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c453a7a ARROW-1968: [C++/Python] Add basic unit tests for ORC reader
c453a7a is described below
commit c453a7ad3e004b3ad2a7d60a5e9853b4a63a76e0
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sat Aug 18 13:45:46 2018 -0400
ARROW-1968: [C++/Python] Add basic unit tests for ORC reader
These use the example files published in the Apache ORC repository.
Author: Wes McKinney <we...@apache.org>
Author: Antoine Pitrou <an...@python.org>
Closes #2428 from pitrou/ARROW-1968-orc-tests and squashes the following commits:
6f538923 <Wes McKinney> Update quay.io image used for pandas 0.23.4
28335ff9 <Wes McKinney> Use pandas 0.23.4 everywhere
d5d34e53 <Wes McKinney> Upgrade pandas in manylinux1 image to latest
c51a77f1 <Wes McKinney> Do not use binary read mode
dd456204 <Wes McKinney> Use pandas for ORC tests, delete pickles
43b0208a <Antoine Pitrou> Add decimals test
06c0f5f4 <Antoine Pitrou> Fix tests on py2
42f01864 <Antoine Pitrou> ARROW-1968: Add basic unit tests for ORC reader
---
python/manylinux1/Dockerfile-x86_64 | 2 +-
python/manylinux1/Dockerfile-x86_64_base | 2 +-
python/manylinux1/scripts/build_virtualenvs.sh | 6 +-
python/pyarrow/tests/conftest.py | 10 +-
python/pyarrow/tests/data/orc/README.md | 22 +++
.../tests/data/orc/TestOrcFile.emptyFile.jsn.gz | Bin 0 -> 50 bytes
.../tests/data/orc/TestOrcFile.emptyFile.orc | Bin 0 -> 523 bytes
.../tests/data/orc/TestOrcFile.test1.jsn.gz | Bin 0 -> 323 bytes
.../pyarrow/tests/data/orc/TestOrcFile.test1.orc | Bin 0 -> 1711 bytes
.../tests/data/orc/TestOrcFile.testDate1900.jsn.gz | Bin 0 -> 182453 bytes
.../tests/data/orc/TestOrcFile.testDate1900.orc | Bin 0 -> 30941 bytes
python/pyarrow/tests/data/orc/decimal.jsn.gz | Bin 0 -> 19313 bytes
python/pyarrow/tests/data/orc/decimal.orc | Bin 0 -> 16337 bytes
python/pyarrow/tests/test_orc.py | 160 +++++++++++++++++++++
14 files changed, 195 insertions(+), 7 deletions(-)
diff --git a/python/manylinux1/Dockerfile-x86_64 b/python/manylinux1/Dockerfile-x86_64
index 4b26e21..306610a 100644
--- a/python/manylinux1/Dockerfile-x86_64
+++ b/python/manylinux1/Dockerfile-x86_64
@@ -14,7 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-FROM quay.io/xhochy/arrow_manylinux1_x86_64_base:ARROW-2850
+FROM quay.io/wesm/arrow_manylinux1_x86_64_base:ARROW-1968
ADD arrow /arrow
WORKDIR /arrow/cpp
diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base
index 955abdd..1302590 100644
--- a/python/manylinux1/Dockerfile-x86_64_base
+++ b/python/manylinux1/Dockerfile-x86_64_base
@@ -17,7 +17,7 @@
FROM quay.io/pypa/manylinux1_x86_64:latest
# Install dependencies
-RUN yum install -y flex zlib-devel && yum clean all
+RUN yum install -y flex zlib-devel wget && yum clean all
ADD scripts/build_openssl.sh /
RUN /build_openssl.sh
diff --git a/python/manylinux1/scripts/build_virtualenvs.sh b/python/manylinux1/scripts/build_virtualenvs.sh
index 6eb0c5a..7361ea0 100755
--- a/python/manylinux1/scripts/build_virtualenvs.sh
+++ b/python/manylinux1/scripts/build_virtualenvs.sh
@@ -19,7 +19,7 @@
# Build upon the scripts in https://github.com/matthew-brett/manylinux-builds
# * Copyright (c) 2013-2016, Matt Terry and Matthew Brett (BSD 2-clause)
-PYTHON_VERSIONS="${PYTHON_VERSIONS:-2.7,16 2.7,32 3.4,16 3.5,16 3.6,16}"
+PYTHON_VERSIONS="${PYTHON_VERSIONS:-2.7,16 2.7,32 3.5,16 3.6,16}"
source /multibuild/manylinux_utils.sh
@@ -35,13 +35,13 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do
echo "=== (${PYTHON}, ${U_WIDTH}) Installing build dependencies ==="
$PIP install "numpy==1.10.4"
$PIP install "cython==0.28.1"
- $PIP install "pandas==0.20.3"
+ $PIP install "pandas==0.23.4"
$PIP install "virtualenv==15.1.0"
echo "=== (${PYTHON}, ${U_WIDTH}) Preparing virtualenv for tests ==="
"$(cpython_path $PYTHON ${U_WIDTH})/bin/virtualenv" -p ${PYTHON_INTERPRETER} --no-download /venv-test-${PYTHON}-${U_WIDTH}
source /venv-test-${PYTHON}-${U_WIDTH}/bin/activate
- pip install pytest 'numpy==1.14.0' 'pandas==0.20.3'
+ pip install pytest 'numpy==1.14.0' 'pandas==0.23.4'
deactivate
done
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index b0eff1e..e67aac1 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -21,6 +21,7 @@ from pytest import skip, mark
groups = [
'hdfs',
'large_memory',
+ 'orc',
'parquet',
'plasma',
's3',
@@ -31,20 +32,25 @@ groups = [
defaults = {
'hdfs': False,
'large_memory': False,
+ 'orc': False,
'parquet': False,
'plasma': False,
- 'large_memory': False,
's3': False,
'tensorflow': False
}
try:
+ import pyarrow.orc # noqa
+ defaults['orc'] = True
+except ImportError:
+ pass
+
+try:
import pyarrow.parquet # noqa
defaults['parquet'] = True
except ImportError:
pass
-
try:
import pyarrow.plasma as plasma # noqa
defaults['plasma'] = True
diff --git a/python/pyarrow/tests/data/orc/README.md b/python/pyarrow/tests/data/orc/README.md
new file mode 100644
index 0000000..ccbb0e8
--- /dev/null
+++ b/python/pyarrow/tests/data/orc/README.md
@@ -0,0 +1,22 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+The ORC and JSON files come from the `examples` directory in the Apache ORC
+source tree:
+https://github.com/apache/orc/tree/master/examples
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz
new file mode 100644
index 0000000..91c85cd
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc
new file mode 100644
index 0000000..ecdadcb
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz b/python/pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz
new file mode 100644
index 0000000..5eab19a
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.test1.orc b/python/pyarrow/tests/data/orc/TestOrcFile.test1.orc
new file mode 100644
index 0000000..4fb0bef
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.test1.orc differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz
new file mode 100644
index 0000000..62dbaba
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc
new file mode 100644
index 0000000..f51ffdb
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc differ
diff --git a/python/pyarrow/tests/data/orc/decimal.jsn.gz b/python/pyarrow/tests/data/orc/decimal.jsn.gz
new file mode 100644
index 0000000..e634bd7
Binary files /dev/null and b/python/pyarrow/tests/data/orc/decimal.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/decimal.orc b/python/pyarrow/tests/data/orc/decimal.orc
new file mode 100644
index 0000000..cb0f7b9
Binary files /dev/null and b/python/pyarrow/tests/data/orc/decimal.orc differ
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
new file mode 100644
index 0000000..311a5d4
--- /dev/null
+++ b/python/pyarrow/tests/test_orc.py
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import decimal
+import gzip
+import os
+
+from pandas.util.testing import assert_frame_equal
+import pandas as pd
+import pytest
+
+import pyarrow as pa
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not orc'
+pytestmark = pytest.mark.orc
+
+
+here = os.path.abspath(os.path.dirname(__file__))
+orc_data_dir = os.path.join(here, 'data', 'orc')
+
+
+def path_for_orc_example(name):
+ return os.path.join(orc_data_dir, '%s.orc' % name)
+
+
+def path_for_json_example(name):
+ return os.path.join(orc_data_dir, '%s.jsn.gz' % name)
+
+
+def fix_example_values(actual_cols, expected_cols):
+ """
+ Fix type of expected values (as read from JSON) according to
+ actual ORC datatype.
+ """
+ for name in expected_cols:
+ expected = expected_cols[name]
+ actual = actual_cols[name]
+ typ = actual[0].__class__
+ if typ is bytes:
+ # bytes fields are represented as lists of ints in JSON files
+ # (Python 2: need to use bytearray, not bytes)
+ expected = [bytearray(v) for v in expected]
+ elif issubclass(typ, datetime.datetime):
+ # timestamp fields are represented as strings in JSON files
+ expected = pd.to_datetime(expected)
+ elif issubclass(typ, datetime.date):
+ # date fields are represented as strings in JSON files
+ expected = expected.dt.date
+ elif typ is decimal.Decimal:
+ converted_decimals = [None] * len(expected)
+ # decimal fields are represented as reals in JSON files
+ for i, (d, v) in enumerate(zip(actual, expected)):
+ if not pd.isnull(v):
+ exp = d.as_tuple().exponent
+ factor = 10 ** -exp
+ converted_decimals[i] = (
+ decimal.Decimal(round(v * factor)).scaleb(exp))
+ expected = pd.Series(converted_decimals)
+
+ expected_cols[name] = expected
+
+
+def check_example_values(orc_df, expected_df, start=None, stop=None):
+ if start is not None or stop is not None:
+ expected_df = expected_df[start:stop].reset_index(drop=True)
+ assert_frame_equal(orc_df, expected_df, check_dtype=False)
+
+
+def check_example_file(orc_path, expected_df, need_fix=False):
+ """
+ Check an ORC file against the expected columns dictionary.
+ """
+ from pyarrow import orc
+
+ orc_file = orc.ORCFile(orc_path)
+ # Exercise ORCFile.read()
+ table = orc_file.read()
+ assert isinstance(table, pa.Table)
+
+ # This workaround needed because of ARROW-3080
+ orc_df = pd.DataFrame(table.to_pydict())
+
+ assert set(expected_df.columns) == set(orc_df.columns)
+
+ # reorder columns if necessary
+ if not orc_df.columns.equals(expected_df.columns):
+ expected_df = expected_df.reindex(columns=orc_df.columns)
+
+ if need_fix:
+ fix_example_values(orc_df, expected_df)
+
+ check_example_values(orc_df, expected_df)
+ # Exercise ORCFile.read_stripe()
+ json_pos = 0
+ for i in range(orc_file.nstripes):
+ batch = orc_file.read_stripe(i)
+ check_example_values(pd.DataFrame(batch.to_pydict()),
+ expected_df,
+ start=json_pos,
+ stop=json_pos + len(batch))
+ json_pos += len(batch)
+ assert json_pos == orc_file.nrows
+
+
+def check_example_using_json(example_name):
+ """
+ Check an ORC file example against the equivalent JSON file, as given
+ in the Apache ORC repository (the JSON file has one JSON object per
+ line, corresponding to one row in the ORC file).
+ """
+ # Read JSON file
+ json_path = path_for_json_example(example_name)
+ if json_path.endswith('.gz'):
+ f = gzip.open(json_path, 'r')
+ else:
+ f = open(json_path, 'r')
+
+ table = pd.read_json(f, lines=True)
+
+ check_example_file(path_for_orc_example(example_name), table,
+ need_fix=True)
+
+
+@pytest.mark.xfail(strict=True, reason="ARROW-3049")
+def test_orcfile_empty():
+ check_example_using_json('TestOrcFile.emptyFile')
+
+
+def test_orcfile_test1_json():
+ # Exercise the JSON test path
+ check_example_using_json('TestOrcFile.test1')
+
+
+def test_orcfile_test1_pickle():
+ check_example_using_json('TestOrcFile.test1')
+
+
+def test_orcfile_dates():
+ check_example_using_json('TestOrcFile.testDate1900')
+
+
+def test_orcfile_decimals():
+ check_example_using_json('decimal')