You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/04/07 00:32:25 UTC

[arrow] branch master updated: ARROW-8345: [Python] Ensure feather read/write can work without pandas installed

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 535a865  ARROW-8345: [Python] Ensure feather read/write can work without pandas installed
535a865 is described below

commit 535a865cd28e57565f7f941846d604df1eaee77c
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Mon Apr 6 19:32:03 2020 -0500

    ARROW-8345: [Python] Ensure feather read/write can work without pandas installed
    
    Closes #6849 from jorisvandenbossche/ARROW-8345
    
    Lead-authored-by: Joris Van den Bossche <jo...@gmail.com>
    Co-authored-by: Krisztián Szűcs <sz...@gmail.com>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 python/pyarrow/feather.py            | 15 ++++++-----
 python/pyarrow/tests/test_feather.py | 51 +++++++++++++++++++++++++++++++-----
 2 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index bf04fb2..3aec187 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -137,10 +137,11 @@ def write_feather(df, dest, compression=None, compression_level=None,
         Feather file version. Version 2 is the current. Version 1 is the more
         limited legacy format
     """
-    _check_pandas_version()
-    if (_pandas_api.has_sparse
-            and isinstance(df, _pandas_api.pd.SparseDataFrame)):
-        df = df.to_dense()
+    if _pandas_api.have_pandas:
+        _check_pandas_version()
+        if (_pandas_api.has_sparse
+                and isinstance(df, _pandas_api.pd.SparseDataFrame)):
+            df = df.to_dense()
 
     if _pandas_api.is_data_frame(df):
         table = Table.from_pandas(df, preserve_index=False)
@@ -201,7 +202,10 @@ def read_feather(source, columns=None, use_threads=True):
     -------
     df : pandas.DataFrame
     """
-    return read_table(source, columns=columns).to_pandas(use_threads=True)
+    _check_pandas_version()
+    return read_table(source, columns=columns).to_pandas(
+        use_threads=use_threads
+    )
 
 
 def read_table(source, columns=None, memory_map=True):
@@ -221,7 +225,6 @@ def read_table(source, columns=None, memory_map=True):
     -------
     table : pyarrow.Table
     """
-    _check_pandas_version()
     reader = ext.FeatherReader()
     reader.open(source, use_memory_map=memory_map)
 
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 2fc0e53..b5d77fd 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -36,12 +36,6 @@ except ImportError:
     pass
 
 
-# TODO(wesm): The Feather tests currently are tangled with pandas
-# dependency. We should isolate the pandas-depending parts and mark those with
-# pytest.mark.pandas
-pytestmark = pytest.mark.pandas
-
-
 def random_path(prefix='feather_'):
     return tempfile.mktemp(prefix=prefix)
 
@@ -67,6 +61,7 @@ def teardown_module(module):
             pass
 
 
+@pytest.mark.pandas
 def test_file_not_exist():
     with pytest.raises(pa.ArrowIOError):
         read_feather('test_invalid_file')
@@ -107,6 +102,7 @@ def _assert_error_on_write(df, exc, path=None):
     pytest.raises(exc, f)
 
 
+@pytest.mark.pandas
 def test_dataset(version):
     num_values = (100, 100)
     num_files = 5
@@ -126,6 +122,7 @@ def test_dataset(version):
     assert_frame_equal(data, df)
 
 
+@pytest.mark.pandas
 def test_float_no_nulls(version):
     data = {}
     numpy_dtypes = ['f4', 'f8']
@@ -139,6 +136,7 @@ def test_float_no_nulls(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_read_table(version):
     num_values = (100, 100)
     path = random_path()
@@ -163,6 +161,7 @@ def test_read_table(version):
     assert_frame_equal(table.to_pandas(), result.to_pandas())
 
 
+@pytest.mark.pandas
 def test_float_nulls(version):
     num_values = 100
 
@@ -192,6 +191,7 @@ def test_float_nulls(version):
     assert_frame_equal(result, ex_frame)
 
 
+@pytest.mark.pandas
 def test_integer_no_nulls(version):
     data = {}
 
@@ -207,6 +207,7 @@ def test_integer_no_nulls(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_platform_numpy_integers(version):
     data = {}
 
@@ -221,6 +222,7 @@ def test_platform_numpy_integers(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_integer_with_nulls(version):
     # pandas requires upcast to float dtype
     path = random_path()
@@ -251,6 +253,7 @@ def test_integer_with_nulls(version):
     assert_frame_equal(result, ex_frame)
 
 
+@pytest.mark.pandas
 def test_boolean_no_nulls(version):
     num_values = 100
 
@@ -260,6 +263,7 @@ def test_boolean_no_nulls(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_boolean_nulls(version):
     # pandas requires upcast to object dtype
     path = random_path()
@@ -283,6 +287,7 @@ def test_boolean_nulls(version):
     assert_frame_equal(result, ex_frame)
 
 
+@pytest.mark.pandas
 def test_buffer_bounds_error(version):
     # ARROW-1676
     path = random_path()
@@ -300,6 +305,7 @@ def test_buffer_bounds_error(version):
         _check_pandas_roundtrip(expected, version=version)
 
 
+@pytest.mark.pandas
 def test_boolean_object_nulls(version):
     repeats = 100
     arr = np.array([False, None, True] * repeats, dtype=object)
@@ -307,6 +313,7 @@ def test_boolean_object_nulls(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_delete_partial_file_on_error(version):
     if sys.platform == 'win32':
         pytest.skip('Windows hangs on to file handle for some reason')
@@ -330,6 +337,7 @@ def test_delete_partial_file_on_error(version):
     assert not os.path.exists(path)
 
 
+@pytest.mark.pandas
 def test_strings(version):
     repeats = 1000
 
@@ -353,16 +361,19 @@ def test_strings(version):
     _check_pandas_roundtrip(df, expected, version=version)
 
 
+@pytest.mark.pandas
 def test_empty_strings(version):
     df = pd.DataFrame({'strings': [''] * 10})
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_all_none(version):
     df = pd.DataFrame({'all_none': [None] * 10})
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_all_null_category(version):
     # ARROW-1188
     df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
@@ -370,6 +381,7 @@ def test_all_null_category(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_multithreaded_read(version):
     data = {'c{}'.format(i): [''] * 10
             for i in range(100)}
@@ -377,6 +389,7 @@ def test_multithreaded_read(version):
     _check_pandas_roundtrip(df, use_threads=True, version=version)
 
 
+@pytest.mark.pandas
 def test_nan_as_null(version):
     # Create a nan that is not numpy.nan
     values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)
@@ -384,6 +397,7 @@ def test_nan_as_null(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_category(version):
     repeats = 1000
     values = ['foo', None, 'bar', 'qux', np.nan]
@@ -395,6 +409,7 @@ def test_category(version):
     _check_pandas_roundtrip(df, expected, version=version)
 
 
+@pytest.mark.pandas
 def test_timestamp(version):
     df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)})
     df['with_tz'] = (df.naive.dt.tz_localize('utc')
@@ -403,6 +418,7 @@ def test_timestamp(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_timestamp_with_nulls(version):
     df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1),
                                 None,
@@ -412,6 +428,7 @@ def test_timestamp_with_nulls(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 @pytest.mark.xfail(reason="not supported", raises=TypeError)
 def test_timedelta_with_nulls_v1():
     df = pd.DataFrame({'test': [pd.Timedelta('1 day'),
@@ -420,6 +437,7 @@ def test_timedelta_with_nulls_v1():
     _check_pandas_roundtrip(df, version=1)
 
 
+@pytest.mark.pandas
 def test_timedelta_with_nulls():
     df = pd.DataFrame({'test': [pd.Timedelta('1 day'),
                                 None,
@@ -427,6 +445,7 @@ def test_timedelta_with_nulls():
     _check_pandas_roundtrip(df, version=2)
 
 
+@pytest.mark.pandas
 def test_out_of_float64_timestamp_with_nulls(version):
     df = pd.DataFrame(
         {'test': pd.DatetimeIndex([1451606400000000001,
@@ -435,6 +454,7 @@ def test_out_of_float64_timestamp_with_nulls(version):
     _check_pandas_roundtrip(df, version=version)
 
 
+@pytest.mark.pandas
 def test_non_string_columns(version):
     df = pd.DataFrame({0: [1, 2, 3, 4],
                        1: [True, False, True, False]})
@@ -443,6 +463,7 @@ def test_non_string_columns(version):
     _check_pandas_roundtrip(df, expected, version=version)
 
 
+@pytest.mark.pandas
 @pytest.mark.skipif(not os.path.supports_unicode_filenames,
                     reason='unicode filenames not supported')
 def test_unicode_filename(version):
@@ -453,6 +474,7 @@ def test_unicode_filename(version):
                             version=version)
 
 
+@pytest.mark.pandas
 def test_read_columns(version):
     df = pd.DataFrame({
         'foo': [1, 2, 3, 4],
@@ -465,6 +487,7 @@ def test_read_columns(version):
                             columns=['boo', 'woo'])
 
 
+@pytest.mark.pandas
 def test_overwritten_file(version):
     path = random_path()
     TEST_FILES.append(path)
@@ -479,6 +502,7 @@ def test_overwritten_file(version):
     _check_pandas_roundtrip(df, path=path, version=version)
 
 
+@pytest.mark.pandas
 def test_filelike_objects(version):
     buf = io.BytesIO()
 
@@ -493,6 +517,7 @@ def test_filelike_objects(version):
     assert_frame_equal(result, df)
 
 
+@pytest.mark.pandas
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
 @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
 def test_sparse_dataframe(version):
@@ -506,6 +531,7 @@ def test_sparse_dataframe(version):
     _check_pandas_roundtrip(df, expected, version=version)
 
 
+@pytest.mark.pandas
 def test_duplicate_columns():
 
     # https://github.com/wesm/feather/issues/53
@@ -515,6 +541,7 @@ def test_duplicate_columns():
     _assert_error_on_write(df, ValueError)
 
 
+@pytest.mark.pandas
 def test_unsupported():
     # https://github.com/wesm/feather/issues/240
     # serializing actual python objects
@@ -531,6 +558,7 @@ def test_unsupported():
     _assert_error_on_write(df, TypeError)
 
 
+@pytest.mark.pandas
 def test_v2_set_chunksize():
     df = pd.DataFrame({'A': np.arange(1000)})
     table = pa.table(df)
@@ -545,6 +573,7 @@ def test_v2_set_chunksize():
     assert len(ipc_file.get_batch(0)) == 250
 
 
+@pytest.mark.pandas
 def test_v2_compression_options():
     df = pd.DataFrame({'A': np.arange(1000)})
 
@@ -597,12 +626,14 @@ def test_v1_unsupported_types():
 
 
 @pytest.mark.slow
+@pytest.mark.pandas
 def test_large_dataframe(version):
     df = pd.DataFrame({'A': np.arange(400000000)})
     _check_pandas_roundtrip(df, version=version)
 
 
 @pytest.mark.large_memory
+@pytest.mark.pandas
 def test_chunked_binary_error_message():
     # ARROW-3058: As Feather does not yet support chunked columns, we at least
     # make sure it's clear to the user what is going on
@@ -623,3 +654,11 @@ def test_chunked_binary_error_message():
                        "capacity of a Feather binary column. This restriction "
                        "may be lifted in the future"):
         write_feather(df, io.BytesIO(), version=1)
+
+
+def test_feather_without_pandas(tempdir, version):
+    # ARROW-8345
+    table = pa.table([pa.array([1, 2, 3])], names=['f0'])
+    write_feather(table, str(tempdir / "data.feather"), version=version)
+    result = read_table(str(tempdir / "data.feather"))
+    assert result.equals(table)