You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/06/24 19:31:46 UTC

[arrow] branch master updated: ARROW-16898: [Python] Fix pandas conversion failure when using non-str index name (#13402)

This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 65a6929c18 ARROW-16898: [Python] Fix pandas conversion failure when using non-str index name (#13402)
65a6929c18 is described below

commit 65a6929c184e9e613982d5f18125deb68b311258
Author: Martin Liu <14...@users.noreply.github.com>
AuthorDate: Fri Jun 24 12:31:40 2022 -0700

    ARROW-16898: [Python] Fix pandas conversion failure when using non-str index name (#13402)
    
    Ticket: https://issues.apache.org/jira/browse/ARROW-16898
    
    When calling `Table.from_pandas(df)`, the current code did not convert the `index` name to str (although it did [convert `column` names to str](https://github.com/apache/arrow/blob/apache-arrow-8.0.0/python/pyarrow/pandas_compat.py#L356)), so the conversion fails if the df has a **non-str index name**.
    
    Code to reproduce:
    ```python
    import pandas as pd
    import pyarrow as pa
    
    df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
    df = df.set_index(0)
    pa.Table.from_pandas(df)
    ```
    
    Error:
    ```python
    ---------------------------------------------------------------------------
    TypeError                                 Traceback (most recent call last)
    Input In [3], in <module>
          4 df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
          5 df = df.set_index(0)
    ----> 6 pa.Table.from_pandas(df)
    
    File ~/src/mlpsandboxrt/venv/lib/python3.8/site-packages/pyarrow/table.pxi:1394, in pyarrow.lib.Table.from_pandas()
    
    File ~/src/mlpsandboxrt/venv/lib/python3.8/site-packages/pyarrow/pandas_compat.py:610, in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
        608     for name, type_ in zip(all_names, types):
        609         name = name if name is not None else 'None'
    --> 610         fields.append(pa.field(name, type_))
        611     schema = pa.schema(fields)
        613 pandas_metadata = construct_metadata(df, column_names, index_columns,
        614                                      index_descriptors, preserve_index,
        615                                      types)
    
    File ~/src/mlpsandboxrt/venv/lib/python3.8/site-packages/pyarrow/types.pxi:1698, in pyarrow.lib.field()
    
    File stringsource:15, in string.from_py.__pyx_convert_string_from_py_std__in_string()
    
    TypeError: expected bytes, int found
    ```
    
    This PR uses `_column_name_to_strings` to convert the index name to str before using it.
    
    Lead-authored-by: Martin Liu <ma...@lyft.com>
    Co-authored-by: Martin Liu <14...@users.noreply.github.com>
    Signed-off-by: David Li <li...@gmail.com>
---
 python/pyarrow/pandas_compat.py     | 23 +++++++++++++++++++----
 python/pyarrow/tests/test_pandas.py | 16 ++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index f2f67c1cb0..689cbca6b7 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -217,17 +217,32 @@ def construct_metadata(columns_to_convert, df, column_names, index_levels,
 
     index_column_metadata = []
     if preserve_index is not False:
+        non_str_index_names = []
         for level, arrow_type, descriptor in zip(index_levels, index_types,
                                                  index_descriptors):
             if isinstance(descriptor, dict):
                 # The index is represented in a non-serialized fashion,
                 # e.g. RangeIndex
                 continue
-            metadata = get_column_metadata(level, name=level.name,
-                                           arrow_type=arrow_type,
-                                           field_name=descriptor)
+
+            if level.name is not None and not isinstance(level.name, str):
+                non_str_index_names.append(level.name)
+
+            metadata = get_column_metadata(
+                level,
+                name=_column_name_to_strings(level.name),
+                arrow_type=arrow_type,
+                field_name=descriptor,
+            )
             index_column_metadata.append(metadata)
 
+        if len(non_str_index_names) > 0:
+            warnings.warn(
+                f"The DataFrame has non-str index name `{non_str_index_names}`"
+                " which will be converted to string"
+                " and not roundtrip correctly.",
+                UserWarning, stacklevel=4)
+
         column_indexes = []
 
         levels = getattr(df.columns, 'levels', [df.columns])
@@ -325,7 +340,7 @@ def _index_level_name(index, i, column_names):
     name : str
     """
     if index.name is not None and index.name not in column_names:
-        return index.name
+        return _column_name_to_strings(index.name)
     else:
         return '__index_level_{:d}__'.format(i)
 
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 143bb0e33e..215bf2fb8c 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -149,6 +149,22 @@ class TestConvertMetadata:
         table = pa.Table.from_pandas(df)
         assert table.field(0).name == '0'
 
+    def test_non_string_columns_with_index(self):
+        df = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [4.0, 5.0, 6.0]})
+        df = df.set_index(0)
+
+        # assert that the from_pandas raises the warning
+        with pytest.warns(UserWarning):
+            table = pa.Table.from_pandas(df)
+            assert table.field(0).name == '1'
+
+        expected = df.copy()
+        # non-str index name will be converted to str
+        expected.index.name = str(expected.index.name)
+        with pytest.warns(UserWarning):
+            _check_pandas_roundtrip(df, expected=expected,
+                                    preserve_index=True)
+
     def test_from_pandas_with_columns(self):
         df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]},
                           columns=[1, 0])