You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/11 17:59:13 UTC
[arrow] branch master updated: ARROW-3650: [Python] warn on converting DataFrame with mixed type column names

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 27daba0  ARROW-3650: [Python] warn on converting DataFrame with mixed type column names
27daba0 is described below

commit 27daba047533bf4e9e1cf4485cc9d4bc5c416ec9
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Tue Jun 11 12:59:03 2019 -0500

    ARROW-3650: [Python] warn on converting DataFrame with mixed type column names
    
    https://issues.apache.org/jira/browse/ARROW-3650
    
    Author: Joris Van den Bossche <jo...@gmail.com>
    
    Closes #4244 from jorisvandenbossche/ARROW-3650-mixed-column-names and squashes the following commits:
    
    e5503eac2 <Joris Van den Bossche> lint
    bae270adb <Joris Van den Bossche> add comment
    0dd474ae6 <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-3650-mixed-column-names
    dd3359522 <Joris Van den Bossche> fix python 2
    5e4acd028 <Joris Van den Bossche> consolidate tests
    c7e33ea4a <Joris Van den Bossche> fix bug for mixed-integer-float
    c362ee3d3 <Joris Van den Bossche> ARROW-3650:  warn on converting DataFrame with mixed type column names
---
 python/pyarrow/pandas_compat.py     | 11 +++++++++--
 python/pyarrow/tests/test_pandas.py | 29 +++++++++++++++++------------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 6eb713e..4ec3a56 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -21,6 +21,7 @@ import ast
 import json
 import operator
 import re
+import warnings
 
 import numpy as np
 
@@ -248,6 +249,11 @@ def construct_metadata(df, column_names, index_levels, index_descriptors,
 def _get_simple_index_descriptor(level):
     string_dtype, extra_metadata = get_extension_dtype_info(level)
     pandas_type = get_logical_type_from_numpy(level)
+    if 'mixed' in pandas_type:
+        warnings.warn(
+            "The DataFrame has column names of mixed type. They will be "
+            "converted to strings and not roundtrip correctly.",
+            UserWarning, stacklevel=4)
     if pandas_type == 'unicode':
         assert not extra_metadata
         extra_metadata = {'encoding': 'UTF-8'}
@@ -789,8 +795,6 @@ _pandas_logical_type_map = {
     'bytes': np.bytes_,
     'string': np.str_,
     'empty': np.object_,
-    'mixed': np.object_,
-    'mixed-integer': np.object_
 }
 
 
@@ -810,6 +814,9 @@ def _pandas_type_to_numpy_type(pandas_type):
     try:
         return _pandas_logical_type_map[pandas_type]
     except KeyError:
+        if 'mixed' in pandas_type:
+            # catching 'mixed', 'mixed-integer' and 'mixed-integer-float'
+            return np.object_
         return np.dtype(pandas_type)
 
 
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 6e2c9f1..3b5bd57 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -239,7 +239,8 @@ class TestConvertMetadata(object):
             ),
             columns=['a', None, '__index_level_0__'],
         )
-        t = pa.Table.from_pandas(df, preserve_index=True)
+        with pytest.warns(UserWarning):
+            t = pa.Table.from_pandas(df, preserve_index=True)
         js = t.schema.pandas_metadata
 
         col1, col2, col3, idx0, foo = js['columns']
@@ -363,12 +364,22 @@ class TestConvertMetadata(object):
 
         _check_pandas_roundtrip(df, preserve_index=True)
 
-    def test_mixed_unicode_column_names(self):
-        df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
+    def test_mixed_column_names(self):
+        # mixed type column names are not reconstructed exactly
+        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
 
-        # TODO(phillipc): Should this raise?
-        with pytest.raises(AssertionError):
-            _check_pandas_roundtrip(df, preserve_index=True)
+        for cols in [[u'あ', b'a'], [1, '2'], [1, 1.5]]:
+            df.columns = pd.Index(cols, dtype=object)
+
+            # assert that from_pandas raises the warning
+            with pytest.warns(UserWarning):
+                pa.Table.from_pandas(df)
+
+            expected = df.copy()
+            expected.columns = df.columns.astype(six.text_type)
+            with pytest.warns(UserWarning):
+                _check_pandas_roundtrip(df, expected=expected,
+                                        preserve_index=True)
 
     def test_binary_column_name(self):
         column_data = [u'い']
@@ -2277,12 +2288,6 @@ class TestConvertMisc(object):
         arr = pa.array(data['y'], type=pa.int16())
         assert arr.to_pylist() == [-1, 2]
 
-    def test_mixed_integer_columns(self):
-        row = [[], []]
-        df = pd.DataFrame(data=[row], columns=['foo', 123])
-        expected_df = pd.DataFrame(data=[row], columns=['foo', '123'])
-        _check_pandas_roundtrip(df, expected=expected_df, preserve_index=True)
-
     def test_safe_unsafe_casts(self):
         # ARROW-2799
         df = pd.DataFrame({