You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/11 17:59:13 UTC
[arrow] branch master updated: ARROW-3650: [Python] warn on converting DataFrame with mixed type column names
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 27daba0 ARROW-3650: [Python] warn on converting DataFrame with mixed type column names
27daba0 is described below
commit 27daba047533bf4e9e1cf4485cc9d4bc5c416ec9
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Tue Jun 11 12:59:03 2019 -0500
ARROW-3650: [Python] warn on converting DataFrame with mixed type column names
https://issues.apache.org/jira/browse/ARROW-3650
Author: Joris Van den Bossche <jo...@gmail.com>
Closes #4244 from jorisvandenbossche/ARROW-3650-mixed-column-names and squashes the following commits:
e5503eac2 <Joris Van den Bossche> lint
bae270adb <Joris Van den Bossche> add comment
0dd474ae6 <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-3650-mixed-column-names
dd3359522 <Joris Van den Bossche> fix python 2
5e4acd028 <Joris Van den Bossche> consolidate tests
c7e33ea4a <Joris Van den Bossche> fix bug for mixed-integer-float
c362ee3d3 <Joris Van den Bossche> ARROW-3650: warn on converting DataFrame with mixed type column names
---
python/pyarrow/pandas_compat.py | 11 +++++++++--
python/pyarrow/tests/test_pandas.py | 29 +++++++++++++++++------------
2 files changed, 26 insertions(+), 14 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 6eb713e..4ec3a56 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -21,6 +21,7 @@ import ast
import json
import operator
import re
+import warnings
import numpy as np
@@ -248,6 +249,11 @@ def construct_metadata(df, column_names, index_levels, index_descriptors,
def _get_simple_index_descriptor(level):
string_dtype, extra_metadata = get_extension_dtype_info(level)
pandas_type = get_logical_type_from_numpy(level)
+ if 'mixed' in pandas_type:
+ warnings.warn(
+ "The DataFrame has column names of mixed type. They will be "
+ "converted to strings and not roundtrip correctly.",
+ UserWarning, stacklevel=4)
if pandas_type == 'unicode':
assert not extra_metadata
extra_metadata = {'encoding': 'UTF-8'}
@@ -789,8 +795,6 @@ _pandas_logical_type_map = {
'bytes': np.bytes_,
'string': np.str_,
'empty': np.object_,
- 'mixed': np.object_,
- 'mixed-integer': np.object_
}
@@ -810,6 +814,9 @@ def _pandas_type_to_numpy_type(pandas_type):
try:
return _pandas_logical_type_map[pandas_type]
except KeyError:
+ if 'mixed' in pandas_type:
+ # catching 'mixed', 'mixed-integer' and 'mixed-integer-float'
+ return np.object_
return np.dtype(pandas_type)
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 6e2c9f1..3b5bd57 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -239,7 +239,8 @@ class TestConvertMetadata(object):
),
columns=['a', None, '__index_level_0__'],
)
- t = pa.Table.from_pandas(df, preserve_index=True)
+ with pytest.warns(UserWarning):
+ t = pa.Table.from_pandas(df, preserve_index=True)
js = t.schema.pandas_metadata
col1, col2, col3, idx0, foo = js['columns']
@@ -363,12 +364,22 @@ class TestConvertMetadata(object):
_check_pandas_roundtrip(df, preserve_index=True)
- def test_mixed_unicode_column_names(self):
- df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
+ def test_mixed_column_names(self):
+ # mixed type column names are not reconstructed exactly
+ df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
- # TODO(phillipc): Should this raise?
- with pytest.raises(AssertionError):
- _check_pandas_roundtrip(df, preserve_index=True)
+ for cols in [[u'あ', b'a'], [1, '2'], [1, 1.5]]:
+ df.columns = pd.Index(cols, dtype=object)
+
+ # assert that the from_pandas raises the warning
+ with pytest.warns(UserWarning):
+ pa.Table.from_pandas(df)
+
+ expected = df.copy()
+ expected.columns = df.columns.astype(six.text_type)
+ with pytest.warns(UserWarning):
+ _check_pandas_roundtrip(df, expected=expected,
+ preserve_index=True)
def test_binary_column_name(self):
column_data = [u'い']
@@ -2277,12 +2288,6 @@ class TestConvertMisc(object):
arr = pa.array(data['y'], type=pa.int16())
assert arr.to_pylist() == [-1, 2]
- def test_mixed_integer_columns(self):
- row = [[], []]
- df = pd.DataFrame(data=[row], columns=['foo', 123])
- expected_df = pd.DataFrame(data=[row], columns=['foo', '123'])
- _check_pandas_roundtrip(df, expected=expected_df, preserve_index=True)
-
def test_safe_unsafe_casts(self):
# ARROW-2799
df = pd.DataFrame({