Posted to commits@arrow.apache.org by uw...@apache.org on 2017/06/05 10:18:37 UTC

arrow git commit: ARROW-1051: [Python] Opt in to Parquet unit tests to avoid accidental suppression of dynamic linking errors

Repository: arrow
Updated Branches:
  refs/heads/master a81aefbd8 -> 8f2b44b89


ARROW-1051: [Python] Opt in to Parquet unit tests to avoid accidental suppression of dynamic linking errors

Author: Wes McKinney <we...@twosigma.com>

Closes #729 from wesm/ARROW-1051 and squashes the following commits:

019b9ec [Wes McKinney] Statically link boost in parquet-cpp
5103077 [Wes McKinney] See if updating conda helps
7eac948 [Wes McKinney] See if setting PATH solves problem
e246e19 [Wes McKinney] Red herring, issue was runtime library loading
6bc0492 [Wes McKinney] Set PARQUET_ARROW_VERSION in Windows build
a1f2d2b [Wes McKinney] Opt in to Parquet unit tests so that import errors from pyarrow.parquet bubble up


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8f2b44b8
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8f2b44b8
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8f2b44b8

Branch: refs/heads/master
Commit: 8f2b44b897b7083ee2a296c70397dc2d7d21d95e
Parents: a81aefb
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Jun 5 12:18:32 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Jun 5 12:18:32 2017 +0200

----------------------------------------------------------------------
 ci/msvc-build.bat                    |  10 ++-
 ci/travis_script_python.sh           |   2 +-
 python/pyarrow/tests/conftest.py     |   2 +-
 python/pyarrow/tests/test_parquet.py | 132 +++++++++++++++++++-----------
 4 files changed, 92 insertions(+), 54 deletions(-)
----------------------------------------------------------------------
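
The crux of this change: pytest.importorskip('pyarrow.parquet') skips the whole test module whenever the import raises ImportError, including failures caused by broken dynamic linking of the Parquet extension, so a mis-built package could pass CI as a clean skip. The tests now import pyarrow.parquet lazily inside functions carrying the parquet marker, and CI passes --parquet so any import error surfaces as a failure. A condensed sketch of the two patterns (illustrative, not copied verbatim from the diff):

    import pytest

    # Old pattern: any ImportError (including a missing DLL / shared
    # library behind the extension module) silently skips every test here.
    # pq = pytest.importorskip('pyarrow.parquet')

    # New pattern: tests opt in via a marker and import lazily, so a
    # linking problem fails loudly when the suite runs with --parquet.
    parquet = pytest.mark.parquet

    @parquet
    def test_roundtrip(tmpdir):
        import pyarrow.parquet as pq  # raises if the extension cannot load
        ...

With the marker-based opt-in in place, the CI scripts below invoke pytest with --parquet instead of relying on importorskip.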


http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/ci/msvc-build.bat
----------------------------------------------------------------------
diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat
index d13c11f..263d4bc 100644
--- a/ci/msvc-build.bat
+++ b/ci/msvc-build.bat
@@ -17,6 +17,8 @@
 
 @echo on
 
+conda update --yes --quiet conda
+
 conda create -n arrow -q -y python=%PYTHON% ^
       six pytest setuptools numpy pandas cython
 conda install -n arrow -q -y -c conda-forge ^
@@ -43,7 +45,7 @@ cmake -G "%GENERATOR%" ^
 cmake --build . --target INSTALL --config Release  || exit /B
 
 @rem Needed so python-test.exe works
-set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%
+set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH%
 
 ctest -VV  || exit /B
 popd
@@ -59,15 +61,17 @@ set PARQUET_HOME=%CONDA_PREFIX%\Library
 cmake -G "%GENERATOR%" ^
      -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^
      -DCMAKE_BUILD_TYPE=Release ^
+     -DPARQUET_BOOST_USE_SHARED=OFF ^
      -DPARQUET_ZLIB_VENDORED=off ^
      -DPARQUET_BUILD_TESTS=off .. || exit /B
 cmake --build . --target INSTALL --config Release || exit /B
 popd
 
 @rem Build and import pyarrow
-set PYTHONPATH=
+@rem parquet-cpp has some additional runtime dependencies that we need to figure out
+@rem see PARQUET-1018
 
 pushd python
 python setup.py build_ext --inplace --with-parquet --bundle-arrow-cpp bdist_wheel  || exit /B
-py.test pyarrow -v -s || exit /B
+py.test pyarrow -v -s --parquet || exit /B
 popd

http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/ci/travis_script_python.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index c3735cc..904db52 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -111,7 +111,7 @@ python_version_tests() {
   python -c "import pyarrow.parquet"
   python -c "import pyarrow._jemalloc"
 
-  python -m pytest -vv -r sxX pyarrow
+  python -m pytest -vv -r sxX pyarrow --parquet
 
   # Build documentation once
   if [[ "$PYTHON_VERSION" == "3.6" ]]

http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/python/pyarrow/tests/conftest.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index d5b4b69..9b767fc 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -18,7 +18,7 @@
 from pytest import skip
 
 
-groups = ['hdfs']
+groups = ['hdfs', 'parquet']
 
 
 def pytest_configure(config):
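
The conftest hunk above only adds 'parquet' to the opt-in groups list; the surrounding machinery is not shown in this diff. As a rough sketch of how such a group is commonly wired into a pytest conftest (the option handling and hook bodies below are illustrative assumptions, not the exact pyarrow implementation):

    import pytest

    groups = ['hdfs', 'parquet']

    def pytest_addoption(parser):
        # Expose one --<group> flag per opt-in group, e.g. --parquet
        for group in groups:
            parser.addoption('--{0}'.format(group), action='store_true',
                             default=False,
                             help='Enable the {0} test group'.format(group))

    def pytest_runtest_setup(item):
        # Skip tests carrying a group marker unless that group was requested
        for group in groups:
            if group in item.keywords and not item.config.getoption('--' + group):
                pytest.skip('{0} tests not enabled (pass --{0})'.format(group))

Under a scheme like this, py.test pyarrow -v -s --parquet, as used in ci/msvc-build.bat and ci/travis_script_python.sh above, runs the parquet-marked tests; without the flag they are skipped explicitly rather than silently dropped by a failed import.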

http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 5f65f28..052d395 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -32,13 +32,20 @@ import pandas as pd
 
 import pandas.util.testing as tm
 
-# Skip all parquet tests if we can't import pyarrow.parquet
-pq = pytest.importorskip('pyarrow.parquet')
-
 # Ignore these with pytest ... -m 'not parquet'
 parquet = pytest.mark.parquet
 
 
+def _write_table(*args, **kwargs):
+    import pyarrow.parquet as pq
+    return pq.write_table(*args, **kwargs)
+
+
+def _read_table(*args, **kwargs):
+    import pyarrow.parquet as pq
+    return pq.read_table(*args, **kwargs)
+
+
 @parquet
 def test_single_pylist_column_roundtrip(tmpdir):
     for dtype in [int, float]:
@@ -46,8 +53,8 @@ def test_single_pylist_column_roundtrip(tmpdir):
                                .format(dtype.__name__))
         data = [pa.array(list(map(dtype, range(5))))]
         table = pa.Table.from_arrays(data, names=('a', 'b'))
-        pq.write_table(table, filename.strpath)
-        table_read = pq.read_table(filename.strpath)
+        _write_table(table, filename.strpath)
+        table_read = _read_table(filename.strpath)
         for col_written, col_read in zip(table.itercolumns(),
                                          table_read.itercolumns()):
             assert col_written.name == col_read.name
@@ -84,13 +91,14 @@ def alltypes_sample(size=10000, seed=0):
 
 @parquet
 def test_pandas_parquet_2_0_rountrip(tmpdir):
+    import pyarrow.parquet as pq
     df = alltypes_sample(size=10000)
 
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
     assert b'pandas' in arrow_table.schema.metadata
 
-    pq.write_table(arrow_table, filename.strpath, version="2.0")
+    _write_table(arrow_table, filename.strpath, version="2.0")
     table_read = pq.read_pandas(filename.strpath)
     assert b'pandas' in table_read.schema.metadata
 
@@ -102,13 +110,15 @@ def test_pandas_parquet_2_0_rountrip(tmpdir):
 
 @parquet
 def test_pandas_parquet_custom_metadata(tmpdir):
+    import pyarrow.parquet as pq
+
     df = alltypes_sample(size=10000)
 
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
     assert b'pandas' in arrow_table.schema.metadata
 
-    pq.write_table(arrow_table, filename.strpath, version="2.0")
+    _write_table(arrow_table, filename.strpath, version="2.0")
     pf = pq.ParquetFile(filename.strpath)
 
     md = pf.metadata.metadata
@@ -120,6 +130,8 @@ def test_pandas_parquet_custom_metadata(tmpdir):
 
 @parquet
 def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir):
+    import pyarrow.parquet as pq
+
     df = alltypes_sample(size=10000)
 
     filename = tmpdir.join('pandas_rountrip.parquet')
@@ -129,7 +141,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir):
     js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8'))
     assert not js['index_columns']
 
-    pq.write_table(arrow_table, filename.strpath, version="2.0")
+    _write_table(arrow_table, filename.strpath, version="2.0")
     table_read = pq.read_pandas(filename.strpath)
 
     js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8'))
@@ -163,8 +175,8 @@ def test_pandas_parquet_1_0_rountrip(tmpdir):
     })
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df)
-    pq.write_table(arrow_table, filename.strpath, version="1.0")
-    table_read = pq.read_table(filename.strpath)
+    _write_table(arrow_table, filename.strpath, version="1.0")
+    table_read = _read_table(filename.strpath)
     df_read = table_read.to_pandas()
 
     # We pass uint32_t as int64_t if we write Parquet version 1.0
@@ -183,8 +195,8 @@ def test_pandas_column_selection(tmpdir):
     })
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df)
-    pq.write_table(arrow_table, filename.strpath)
-    table_read = pq.read_table(filename.strpath, columns=['uint8'])
+    _write_table(arrow_table, filename.strpath)
+    table_read = _read_table(filename.strpath, columns=['uint8'])
     df_read = table_read.to_pandas()
 
     tm.assert_frame_equal(df[['uint8']], df_read)
@@ -223,19 +235,21 @@ def test_pandas_parquet_native_file_roundtrip(tmpdir):
     df = _test_dataframe(10000)
     arrow_table = pa.Table.from_pandas(df)
     imos = pa.BufferOutputStream()
-    pq.write_table(arrow_table, imos, version="2.0")
+    _write_table(arrow_table, imos, version="2.0")
     buf = imos.get_result()
     reader = pa.BufferReader(buf)
-    df_read = pq.read_table(reader).to_pandas()
+    df_read = _read_table(reader).to_pandas()
     tm.assert_frame_equal(df, df_read)
 
 
 @parquet
 def test_read_pandas_column_subset(tmpdir):
+    import pyarrow.parquet as pq
+
     df = _test_dataframe(10000)
     arrow_table = pa.Table.from_pandas(df)
     imos = pa.BufferOutputStream()
-    pq.write_table(arrow_table, imos, version="2.0")
+    _write_table(arrow_table, imos, version="2.0")
     buf = imos.get_result()
     reader = pa.BufferReader(buf)
     df_read = pq.read_pandas(reader, columns=['strings', 'uint8']).to_pandas()
@@ -257,11 +271,11 @@ def test_pandas_parquet_pyfile_roundtrip(tmpdir):
     arrow_table = pa.Table.from_pandas(df)
 
     with open(filename, 'wb') as f:
-        pq.write_table(arrow_table, f, version="1.0")
+        _write_table(arrow_table, f, version="1.0")
 
     data = io.BytesIO(open(filename, 'rb').read())
 
-    table_read = pq.read_table(data)
+    table_read = _read_table(data)
     df_read = table_read.to_pandas()
     tm.assert_frame_equal(df, df_read)
 
@@ -287,27 +301,29 @@ def test_pandas_parquet_configuration_options(tmpdir):
     arrow_table = pa.Table.from_pandas(df)
 
     for use_dictionary in [True, False]:
-        pq.write_table(arrow_table, filename.strpath,
-                       version="2.0",
-                       use_dictionary=use_dictionary)
-        table_read = pq.read_table(filename.strpath)
+        _write_table(arrow_table, filename.strpath,
+                     version="2.0",
+                     use_dictionary=use_dictionary)
+        table_read = _read_table(filename.strpath)
         df_read = table_read.to_pandas()
         tm.assert_frame_equal(df, df_read)
 
     for compression in ['NONE', 'SNAPPY', 'GZIP']:
-        pq.write_table(arrow_table, filename.strpath,
-                       version="2.0",
-                       compression=compression)
-        table_read = pq.read_table(filename.strpath)
+        _write_table(arrow_table, filename.strpath,
+                     version="2.0",
+                     compression=compression)
+        table_read = _read_table(filename.strpath)
         df_read = table_read.to_pandas()
         tm.assert_frame_equal(df, df_read)
 
 
 def make_sample_file(df):
+    import pyarrow.parquet as pq
+
     a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
 
     buf = io.BytesIO()
-    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')
+    _write_table(a_table, buf, compression='SNAPPY', version='2.0')
 
     buf.seek(0)
     return pq.ParquetFile(buf)
@@ -384,8 +400,8 @@ def test_column_of_arrays(tmpdir):
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
                                        schema=schema)
-    pq.write_table(arrow_table, filename.strpath, version="2.0")
-    table_read = pq.read_table(filename.strpath)
+    _write_table(arrow_table, filename.strpath, version="2.0")
+    table_read = _read_table(filename.strpath)
     df_read = table_read.to_pandas()
     tm.assert_frame_equal(df, df_read)
 
@@ -397,8 +413,8 @@ def test_column_of_lists(tmpdir):
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
                                        schema=schema)
-    pq.write_table(arrow_table, filename.strpath, version="2.0")
-    table_read = pq.read_table(filename.strpath)
+    _write_table(arrow_table, filename.strpath, version="2.0")
+    table_read = _read_table(filename.strpath)
     df_read = table_read.to_pandas()
     tm.assert_frame_equal(df, df_read)
 
@@ -449,7 +465,7 @@ def test_date_time_types():
         buf = io.BytesIO()
 
         with pytest.raises(NotImplementedError):
-            pq.write_table(table, buf, version="2.0")
+            _write_table(table, buf, version="2.0")
 
     t7 = pa.time64('ns')
     a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)
@@ -470,13 +486,13 @@ def test_fixed_size_binary():
 
 def _check_roundtrip(table, expected=None, **params):
     buf = io.BytesIO()
-    pq.write_table(table, buf, **params)
+    _write_table(table, buf, **params)
     buf.seek(0)
 
     if expected is None:
         expected = table
 
-    result = pq.read_table(buf)
+    result = _read_table(buf)
     assert result.equals(expected)
 
 
@@ -487,13 +503,13 @@ def test_multithreaded_read():
     table = pa.Table.from_pandas(df, timestamps_to_ms=True)
 
     buf = io.BytesIO()
-    pq.write_table(table, buf, compression='SNAPPY', version='2.0')
+    _write_table(table, buf, compression='SNAPPY', version='2.0')
 
     buf.seek(0)
-    table1 = pq.read_table(buf, nthreads=4)
+    table1 = _read_table(buf, nthreads=4)
 
     buf.seek(0)
-    table2 = pq.read_table(buf, nthreads=1)
+    table2 = _read_table(buf, nthreads=1)
 
     assert table1.equals(table2)
 
@@ -504,26 +520,28 @@ def test_min_chunksize():
     table = pa.Table.from_pandas(data.reset_index())
 
     buf = io.BytesIO()
-    pq.write_table(table, buf, chunk_size=-1)
+    _write_table(table, buf, chunk_size=-1)
 
     buf.seek(0)
-    result = pq.read_table(buf)
+    result = _read_table(buf)
 
     assert result.equals(table)
 
     with pytest.raises(ValueError):
-        pq.write_table(table, buf, chunk_size=0)
+        _write_table(table, buf, chunk_size=0)
 
 
 @parquet
 def test_pass_separate_metadata():
+    import pyarrow.parquet as pq
+
     # ARROW-471
     df = alltypes_sample(size=10000)
 
     a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
 
     buf = io.BytesIO()
-    pq.write_table(a_table, buf, compression='snappy', version='2.0')
+    _write_table(a_table, buf, compression='snappy', version='2.0')
 
     buf.seek(0)
     metadata = pq.ParquetFile(buf).metadata
@@ -537,6 +555,8 @@ def test_pass_separate_metadata():
 
 @parquet
 def test_read_single_row_group():
+    import pyarrow.parquet as pq
+
     # ARROW-471
     N, K = 10000, 4
     df = alltypes_sample(size=N)
@@ -544,8 +564,8 @@ def test_read_single_row_group():
     a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
 
     buf = io.BytesIO()
-    pq.write_table(a_table, buf, row_group_size=N / K,
-                   compression='snappy', version='2.0')
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.0')
 
     buf.seek(0)
 
@@ -560,13 +580,15 @@ def test_read_single_row_group():
 
 @parquet
 def test_read_single_row_group_with_column_subset():
+    import pyarrow.parquet as pq
+
     N, K = 10000, 4
     df = alltypes_sample(size=N)
     a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
 
     buf = io.BytesIO()
-    pq.write_table(a_table, buf, row_group_size=N / K,
-                   compression='snappy', version='2.0')
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.0')
 
     buf.seek(0)
     pf = pq.ParquetFile(buf)
@@ -579,11 +601,13 @@ def test_read_single_row_group_with_column_subset():
 
 @parquet
 def test_parquet_piece_read(tmpdir):
+    import pyarrow.parquet as pq
+
     df = _test_dataframe(1000)
     table = pa.Table.from_pandas(df)
 
     path = tmpdir.join('parquet_piece_read.parquet').strpath
-    pq.write_table(table, path, version='2.0')
+    _write_table(table, path, version='2.0')
 
     piece1 = pq.ParquetDatasetPiece(path)
 
@@ -593,6 +617,8 @@ def test_parquet_piece_read(tmpdir):
 
 @parquet
 def test_parquet_piece_basics():
+    import pyarrow.parquet as pq
+
     path = '/baz.parq'
 
     piece1 = pq.ParquetDatasetPiece(path)
@@ -612,6 +638,8 @@ def test_parquet_piece_basics():
 
 @parquet
 def test_partition_set_dictionary_type():
+    import pyarrow.parquet as pq
+
     set1 = pq.PartitionSet('key1', [u('foo'), u('bar'), u('baz')])
     set2 = pq.PartitionSet('key2', [2007, 2008, 2009])
 
@@ -625,6 +653,8 @@ def test_partition_set_dictionary_type():
 
 @parquet
 def test_read_partitioned_directory(tmpdir):
+    import pyarrow.parquet as pq
+
     foo_keys = [0, 1]
     bar_keys = ['a', 'b', 'c']
     partition_spec = [
@@ -681,7 +711,7 @@ def _generate_partition_directories(base_dir, partition_spec, df):
 
                 filtered_df = _filter_partition(df, this_part_keys)
                 part_table = pa.Table.from_pandas(filtered_df)
-                pq.write_table(part_table, file_path)
+                _write_table(part_table, file_path)
             else:
                 _visit_level(level_dir, level + 1, this_part_keys)
 
@@ -690,6 +720,8 @@ def _generate_partition_directories(base_dir, partition_spec, df):
 
 @parquet
 def test_read_common_metadata_files(tmpdir):
+    import pyarrow.parquet as pq
+
     N = 100
     df = pd.DataFrame({
         'index': np.arange(N),
@@ -700,7 +732,7 @@ def test_read_common_metadata_files(tmpdir):
     data_path = pjoin(base_path, 'data.parquet')
 
     table = pa.Table.from_pandas(df)
-    pq.write_table(table, data_path)
+    _write_table(table, data_path)
 
     metadata_path = pjoin(base_path, '_metadata')
     pq.write_metadata(table.schema, metadata_path)
@@ -729,6 +761,8 @@ def _filter_partition(df, part_keys):
 
 @parquet
 def test_read_multiple_files(tmpdir):
+    import pyarrow.parquet as pq
+
     nfiles = 10
     size = 5
 
@@ -746,7 +780,7 @@ def test_read_multiple_files(tmpdir):
         path = pjoin(dirpath, '{0}.parquet'.format(i))
 
         table = pa.Table.from_pandas(df)
-        pq.write_table(table, path)
+        _write_table(table, path)
 
         test_data.append(table)
         paths.append(path)
@@ -792,7 +826,7 @@ def test_read_multiple_files(tmpdir):
     bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath
 
     t = pa.Table.from_pandas(bad_apple)
-    pq.write_table(t, bad_apple_path)
+    _write_table(t, bad_apple_path)
 
     bad_meta = pq.ParquetFile(bad_apple_path).metadata