You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/06/24 19:31:46 UTC
[arrow] branch master updated: ARROW-16898: [Python] Fix pandas conversion failure when using non-str index name (#13402)
This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 65a6929c18 ARROW-16898: [Python] Fix pandas conversion failure when using non-str index name (#13402)
65a6929c18 is described below
commit 65a6929c184e9e613982d5f18125deb68b311258
Author: Martin Liu <14...@users.noreply.github.com>
AuthorDate: Fri Jun 24 12:31:40 2022 -0700
ARROW-16898: [Python] Fix pandas conversion failure when using non-str index name (#13402)
Ticket: https://issues.apache.org/jira/browse/ARROW-16898
When calling `Table.from_pandas(df)`, the current code does not convert the `index` name to str (although it does [convert `column` names to str](https://github.com/apache/arrow/blob/apache-arrow-8.0.0/python/pyarrow/pandas_compat.py#L356)), so the conversion fails if the DataFrame has a **non-str index name**.
Code to reproduce:
```python
import pandas as pd
import pyarrow as pa
df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
df = df.set_index(0)
pa.Table.from_pandas(df)
```
Error:
```python
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [3], in <module>
4 df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]})
5 df = df.set_index(0)
----> 6 pa.Table.from_pandas(df)
File ~/src/mlpsandboxrt/venv/lib/python3.8/site-packages/pyarrow/table.pxi:1394, in pyarrow.lib.Table.from_pandas()
File ~/src/mlpsandboxrt/venv/lib/python3.8/site-packages/pyarrow/pandas_compat.py:610, in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
608 for name, type_ in zip(all_names, types):
609 name = name if name is not None else 'None'
--> 610 fields.append(pa.field(name, type_))
611 schema = pa.schema(fields)
613 pandas_metadata = construct_metadata(df, column_names, index_columns,
614 index_descriptors, preserve_index,
615 types)
File ~/src/mlpsandboxrt/venv/lib/python3.8/site-packages/pyarrow/types.pxi:1698, in pyarrow.lib.field()
File stringsource:15, in string.from_py.__pyx_convert_string_from_py_std__in_string()
TypeError: expected bytes, int found
```
This PR uses `_column_name_to_strings` to convert the index name to str before using it.
Lead-authored-by: Martin Liu <ma...@lyft.com>
Co-authored-by: Martin Liu <14...@users.noreply.github.com>
Signed-off-by: David Li <li...@gmail.com>
---
python/pyarrow/pandas_compat.py | 23 +++++++++++++++++++----
python/pyarrow/tests/test_pandas.py | 16 ++++++++++++++++
2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index f2f67c1cb0..689cbca6b7 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -217,17 +217,32 @@ def construct_metadata(columns_to_convert, df, column_names, index_levels,
index_column_metadata = []
if preserve_index is not False:
+ non_str_index_names = []
for level, arrow_type, descriptor in zip(index_levels, index_types,
index_descriptors):
if isinstance(descriptor, dict):
# The index is represented in a non-serialized fashion,
# e.g. RangeIndex
continue
- metadata = get_column_metadata(level, name=level.name,
- arrow_type=arrow_type,
- field_name=descriptor)
+
+ if level.name is not None and not isinstance(level.name, str):
+ non_str_index_names.append(level.name)
+
+ metadata = get_column_metadata(
+ level,
+ name=_column_name_to_strings(level.name),
+ arrow_type=arrow_type,
+ field_name=descriptor,
+ )
index_column_metadata.append(metadata)
+ if len(non_str_index_names) > 0:
+ warnings.warn(
+ f"The DataFrame has non-str index name `{non_str_index_names}`"
+ " which will be converted to string"
+ " and not roundtrip correctly.",
+ UserWarning, stacklevel=4)
+
column_indexes = []
levels = getattr(df.columns, 'levels', [df.columns])
@@ -325,7 +340,7 @@ def _index_level_name(index, i, column_names):
name : str
"""
if index.name is not None and index.name not in column_names:
- return index.name
+ return _column_name_to_strings(index.name)
else:
return '__index_level_{:d}__'.format(i)
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 143bb0e33e..215bf2fb8c 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -149,6 +149,22 @@ class TestConvertMetadata:
table = pa.Table.from_pandas(df)
assert table.field(0).name == '0'
+ def test_non_string_columns_with_index(self):
+ df = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [4.0, 5.0, 6.0]})
+ df = df.set_index(0)
+
+ # assert that the from_pandas raises the warning
+ with pytest.warns(UserWarning):
+ table = pa.Table.from_pandas(df)
+ assert table.field(0).name == '1'
+
+ expected = df.copy()
+ # non-str index name will be converted to str
+ expected.index.name = str(expected.index.name)
+ with pytest.warns(UserWarning):
+ _check_pandas_roundtrip(df, expected=expected,
+ preserve_index=True)
+
def test_from_pandas_with_columns(self):
df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]},
columns=[1, 0])