You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/18 17:45:53 UTC

[arrow] branch master updated: ARROW-1968: [C++/Python] Add basic unit tests for ORC reader

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c453a7a  ARROW-1968: [C++/Python] Add basic unit tests for ORC reader
c453a7a is described below

commit c453a7ad3e004b3ad2a7d60a5e9853b4a63a76e0
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sat Aug 18 13:45:46 2018 -0400

    ARROW-1968: [C++/Python] Add basic unit tests for ORC reader
    
    These use the example files published in the Apache ORC repository.
    
    Author: Wes McKinney <we...@apache.org>
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2428 from pitrou/ARROW-1968-orc-tests and squashes the following commits:
    
    6f538923 <Wes McKinney> Update quay.io image used for pandas 0.23.4
    28335ff9 <Wes McKinney> Use pandas 0.23.4 everywhere
    d5d34e53 <Wes McKinney> Upgrade pandas in manylinux1 image to latest
    c51a77f1 <Wes McKinney> Do not use binary read mode
    dd456204 <Wes McKinney> Use pandas for ORC tests, delete pickles
    43b0208a <Antoine Pitrou> Add decimals test
    06c0f5f4 <Antoine Pitrou> Fix tests on py2
    42f01864 <Antoine Pitrou> ARROW-1968:  Add basic unit tests for ORC reader
---
 python/manylinux1/Dockerfile-x86_64                |   2 +-
 python/manylinux1/Dockerfile-x86_64_base           |   2 +-
 python/manylinux1/scripts/build_virtualenvs.sh     |   6 +-
 python/pyarrow/tests/conftest.py                   |  10 +-
 python/pyarrow/tests/data/orc/README.md            |  22 +++
 .../tests/data/orc/TestOrcFile.emptyFile.jsn.gz    | Bin 0 -> 50 bytes
 .../tests/data/orc/TestOrcFile.emptyFile.orc       | Bin 0 -> 523 bytes
 .../tests/data/orc/TestOrcFile.test1.jsn.gz        | Bin 0 -> 323 bytes
 .../pyarrow/tests/data/orc/TestOrcFile.test1.orc   | Bin 0 -> 1711 bytes
 .../tests/data/orc/TestOrcFile.testDate1900.jsn.gz | Bin 0 -> 182453 bytes
 .../tests/data/orc/TestOrcFile.testDate1900.orc    | Bin 0 -> 30941 bytes
 python/pyarrow/tests/data/orc/decimal.jsn.gz       | Bin 0 -> 19313 bytes
 python/pyarrow/tests/data/orc/decimal.orc          | Bin 0 -> 16337 bytes
 python/pyarrow/tests/test_orc.py                   | 160 +++++++++++++++++++++
 14 files changed, 195 insertions(+), 7 deletions(-)

diff --git a/python/manylinux1/Dockerfile-x86_64 b/python/manylinux1/Dockerfile-x86_64
index 4b26e21..306610a 100644
--- a/python/manylinux1/Dockerfile-x86_64
+++ b/python/manylinux1/Dockerfile-x86_64
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-FROM quay.io/xhochy/arrow_manylinux1_x86_64_base:ARROW-2850
+FROM quay.io/wesm/arrow_manylinux1_x86_64_base:ARROW-1968
 
 ADD arrow /arrow
 WORKDIR /arrow/cpp
diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base
index 955abdd..1302590 100644
--- a/python/manylinux1/Dockerfile-x86_64_base
+++ b/python/manylinux1/Dockerfile-x86_64_base
@@ -17,7 +17,7 @@
 FROM quay.io/pypa/manylinux1_x86_64:latest
 
 # Install dependencies
-RUN yum install -y flex zlib-devel && yum clean all
+RUN yum install -y flex zlib-devel wget && yum clean all
 
 ADD scripts/build_openssl.sh /
 RUN /build_openssl.sh
diff --git a/python/manylinux1/scripts/build_virtualenvs.sh b/python/manylinux1/scripts/build_virtualenvs.sh
index 6eb0c5a..7361ea0 100755
--- a/python/manylinux1/scripts/build_virtualenvs.sh
+++ b/python/manylinux1/scripts/build_virtualenvs.sh
@@ -19,7 +19,7 @@
 # Build upon the scripts in https://github.com/matthew-brett/manylinux-builds
 # * Copyright (c) 2013-2016, Matt Terry and Matthew Brett (BSD 2-clause)
 
-PYTHON_VERSIONS="${PYTHON_VERSIONS:-2.7,16 2.7,32 3.4,16 3.5,16 3.6,16}"
+PYTHON_VERSIONS="${PYTHON_VERSIONS:-2.7,16 2.7,32 3.5,16 3.6,16}"
 
 source /multibuild/manylinux_utils.sh
 
@@ -35,13 +35,13 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do
     echo "=== (${PYTHON}, ${U_WIDTH}) Installing build dependencies ==="
     $PIP install "numpy==1.10.4"
     $PIP install "cython==0.28.1"
-    $PIP install "pandas==0.20.3"
+    $PIP install "pandas==0.23.4"
     $PIP install "virtualenv==15.1.0"
 
     echo "=== (${PYTHON}, ${U_WIDTH}) Preparing virtualenv for tests ==="
     "$(cpython_path $PYTHON ${U_WIDTH})/bin/virtualenv" -p ${PYTHON_INTERPRETER} --no-download /venv-test-${PYTHON}-${U_WIDTH}
     source /venv-test-${PYTHON}-${U_WIDTH}/bin/activate
-    pip install pytest 'numpy==1.14.0' 'pandas==0.20.3'
+    pip install pytest 'numpy==1.14.0' 'pandas==0.23.4'
     deactivate
 done
 
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index b0eff1e..e67aac1 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -21,6 +21,7 @@ from pytest import skip, mark
 groups = [
     'hdfs',
     'large_memory',
+    'orc',
     'parquet',
     'plasma',
     's3',
@@ -31,20 +32,25 @@ groups = [
 defaults = {
     'hdfs': False,
     'large_memory': False,
+    'orc': False,
     'parquet': False,
     'plasma': False,
-    'large_memory': False,
     's3': False,
     'tensorflow': False
 }
 
 try:
+    import pyarrow.orc # noqa
+    defaults['orc'] = True
+except ImportError:
+    pass
+
+try:
     import pyarrow.parquet  # noqa
     defaults['parquet'] = True
 except ImportError:
     pass
 
-
 try:
     import pyarrow.plasma as plasma  # noqa
     defaults['plasma'] = True
diff --git a/python/pyarrow/tests/data/orc/README.md b/python/pyarrow/tests/data/orc/README.md
new file mode 100644
index 0000000..ccbb0e8
--- /dev/null
+++ b/python/pyarrow/tests/data/orc/README.md
@@ -0,0 +1,22 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+The ORC and JSON files come from the `examples` directory in the Apache ORC
+source tree:
+https://github.com/apache/orc/tree/master/examples
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz
new file mode 100644
index 0000000..91c85cd
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc
new file mode 100644
index 0000000..ecdadcb
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz b/python/pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz
new file mode 100644
index 0000000..5eab19a
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.test1.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.test1.orc b/python/pyarrow/tests/data/orc/TestOrcFile.test1.orc
new file mode 100644
index 0000000..4fb0bef
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.test1.orc differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz
new file mode 100644
index 0000000..62dbaba
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc
new file mode 100644
index 0000000..f51ffdb
Binary files /dev/null and b/python/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc differ
diff --git a/python/pyarrow/tests/data/orc/decimal.jsn.gz b/python/pyarrow/tests/data/orc/decimal.jsn.gz
new file mode 100644
index 0000000..e634bd7
Binary files /dev/null and b/python/pyarrow/tests/data/orc/decimal.jsn.gz differ
diff --git a/python/pyarrow/tests/data/orc/decimal.orc b/python/pyarrow/tests/data/orc/decimal.orc
new file mode 100644
index 0000000..cb0f7b9
Binary files /dev/null and b/python/pyarrow/tests/data/orc/decimal.orc differ
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
new file mode 100644
index 0000000..311a5d4
--- /dev/null
+++ b/python/pyarrow/tests/test_orc.py
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import decimal
+import gzip
+import os
+
+from pandas.util.testing import assert_frame_equal
+import pandas as pd
+import pytest
+
+import pyarrow as pa
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not orc'
+pytestmark = pytest.mark.orc
+
+
+here = os.path.abspath(os.path.dirname(__file__))
+orc_data_dir = os.path.join(here, 'data', 'orc')
+
+
+def path_for_orc_example(name):
+    return os.path.join(orc_data_dir, '%s.orc' % name)
+
+
+def path_for_json_example(name):
+    return os.path.join(orc_data_dir, '%s.jsn.gz' % name)
+
+
+def fix_example_values(actual_cols, expected_cols):
+    """
+    Fix type of expected values (as read from JSON) according to
+    actual ORC datatype.
+    """
+    for name in expected_cols:
+        expected = expected_cols[name]
+        actual = actual_cols[name]
+        typ = actual[0].__class__
+        if typ is bytes:
+            # bytes fields are represented as lists of ints in JSON files
+            # (Python 2: need to use bytearray, not bytes)
+            expected = [bytearray(v) for v in expected]
+        elif issubclass(typ, datetime.datetime):
+            # timestamp fields are represented as strings in JSON files
+            expected = pd.to_datetime(expected)
+        elif issubclass(typ, datetime.date):
+            # date fields are represented as strings in JSON files
+            expected = expected.dt.date
+        elif typ is decimal.Decimal:
+            converted_decimals = [None] * len(expected)
+            # decimal fields are represented as reals in JSON files
+            for i, (d, v) in enumerate(zip(actual, expected)):
+                if not pd.isnull(v):
+                    exp = d.as_tuple().exponent
+                    factor = 10 ** -exp
+                    converted_decimals[i] = (
+                        decimal.Decimal(round(v * factor)).scaleb(exp))
+            expected = pd.Series(converted_decimals)
+
+        expected_cols[name] = expected
+
+
+def check_example_values(orc_df, expected_df, start=None, stop=None):
+    if start is not None or stop is not None:
+        expected_df = expected_df[start:stop].reset_index(drop=True)
+    assert_frame_equal(orc_df, expected_df, check_dtype=False)
+
+
+def check_example_file(orc_path, expected_df, need_fix=False):
+    """
+    Check an ORC file against the expected DataFrame.
+    """
+    from pyarrow import orc
+
+    orc_file = orc.ORCFile(orc_path)
+    # Exercise ORCFile.read()
+    table = orc_file.read()
+    assert isinstance(table, pa.Table)
+
+    # This workaround is needed because of ARROW-3080
+    orc_df = pd.DataFrame(table.to_pydict())
+
+    assert set(expected_df.columns) == set(orc_df.columns)
+
+    # reorder columns if necessary
+    if not orc_df.columns.equals(expected_df.columns):
+        expected_df = expected_df.reindex(columns=orc_df.columns)
+
+    if need_fix:
+        fix_example_values(orc_df, expected_df)
+
+    check_example_values(orc_df, expected_df)
+    # Exercise ORCFile.read_stripe()
+    json_pos = 0
+    for i in range(orc_file.nstripes):
+        batch = orc_file.read_stripe(i)
+        check_example_values(pd.DataFrame(batch.to_pydict()),
+                             expected_df,
+                             start=json_pos,
+                             stop=json_pos + len(batch))
+        json_pos += len(batch)
+    assert json_pos == orc_file.nrows
+
+
+def check_example_using_json(example_name):
+    """
+    Check an ORC file example against the equivalent JSON file, as given
+    in the Apache ORC repository (the JSON file has one JSON object per
+    line, corresponding to one row in the ORC file).
+    """
+    # Read JSON file
+    json_path = path_for_json_example(example_name)
+    if json_path.endswith('.gz'):
+        f = gzip.open(json_path, 'r')
+    else:
+        f = open(json_path, 'r')
+
+    table = pd.read_json(f, lines=True)
+
+    check_example_file(path_for_orc_example(example_name), table,
+                       need_fix=True)
+
+
+@pytest.mark.xfail(strict=True, reason="ARROW-3049")
+def test_orcfile_empty():
+    check_example_using_json('TestOrcFile.emptyFile')
+
+
+def test_orcfile_test1_json():
+    # Exercise the JSON test path
+    check_example_using_json('TestOrcFile.test1')
+
+
+def test_orcfile_test1_pickle():
+    check_example_using_json('TestOrcFile.test1')
+
+
+def test_orcfile_dates():
+    check_example_using_json('TestOrcFile.testDate1900')
+
+
+def test_orcfile_decimals():
+    check_example_using_json('decimal')