You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/21 16:48:25 UTC
[arrow] branch master updated: ARROW-7914: [Python] Allow pandas datetime as index for feather

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 20ec0fda70 ARROW-7914: [Python] Allow pandas datetime as index for feather
20ec0fda70 is described below

commit 20ec0fda708b72e4398e422f8bc3ee8ef0a76528
Author: Salonijain27 <sa...@gmail.com>
AuthorDate: Thu Apr 21 18:48:14 2022 +0200

    ARROW-7914: [Python] Allow pandas datetime as index for feather
    
    Closes #12821 from Salonijain27/ARROW-7914_fetch_update
    
    Lead-authored-by: Salonijain27 <sa...@gmail.com>
    Co-authored-by: salonijain27 <sa...@Salonis-MacBook-Pro.local>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 python/pyarrow/feather.py            | 12 +++++++++++-
 python/pyarrow/tests/test_feather.py | 22 ++++++++++++++++++++--
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 6824f4ba96..f20302d67b 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -151,7 +151,17 @@ def write_feather(df, dest, compression=None, compression_level=None,
             df = df.to_dense()
 
     if _pandas_api.is_data_frame(df):
-        table = Table.from_pandas(df, preserve_index=False)
+        # Feather v1 drops the index; v2 stores a non-RangeIndex index
+        # as an extra column in the resultant Table
+
+        if version == 1:
+            preserve_index = False
+        elif version == 2:
+            preserve_index = None
+        else:
+            raise ValueError("Version value should either be 1 or 2")
+
+        table = Table.from_pandas(df, preserve_index=preserve_index)
 
         if version == 1:
             # Version 1 does not chunking
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 7dab732557..97696fa6a9 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -30,7 +30,6 @@ import pyarrow.tests.strategies as past
 from pyarrow.feather import (read_feather, write_feather, read_table,
                              FeatherDataset)
 
-
 try:
     from pandas.testing import assert_frame_equal
     import pandas as pd
@@ -90,13 +89,18 @@ def _check_pandas_roundtrip(df, expected=None, path=None,
     if path is None:
         path = random_path()
 
+    if version is None:
+        version = 2
+
     TEST_FILES.append(path)
     write_feather(df, path, compression=compression,
                   compression_level=compression_level, version=version)
+
     if not os.path.exists(path):
         raise Exception('file not written')
 
     result = read_feather(path, columns, use_threads=use_threads)
+
     if expected is None:
         expected = df
 
@@ -504,8 +508,10 @@ def test_out_of_float64_timestamp_with_nulls(version):
 def test_non_string_columns(version):
     df = pd.DataFrame({0: [1, 2, 3, 4],
                        1: [True, False, True, False]})
+    expected = df
 
-    expected = df.rename(columns=str)
+    if version == 1:
+        expected = df.rename(columns=str)
     _check_pandas_roundtrip(df, expected, version=version)
 
 
@@ -820,3 +826,15 @@ def test_feather_v017_experimental_compression_backward_compatibility(datadir):
     expected = pa.table({'a': range(5)})
     result = read_table(datadir / "v0.17.0.version.2-compression.lz4.feather")
     assert result.equals(expected)
+
+
+@pytest.mark.pandas
+def test_preserve_index_pandas(version):
+    df = pd.DataFrame({'a': [1, 2, 3]}, index=['a', 'b', 'c'])
+
+    if version == 1:
+        expected = df.reset_index(drop=True).rename(columns=str)
+    else:
+        expected = df
+
+    _check_pandas_roundtrip(df, expected, version=version)