You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/11/05 12:32:19 UTC
[arrow] branch master updated: ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ea4a8f5  ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series
ea4a8f5 is described below

commit ea4a8f5a01c0c028bbec1e199ca70efaffbf068b
Author: Licht-T <li...@outlook.jp>
AuthorDate: Sun Nov 5 13:32:13 2017 +0100

    ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series
    
    This closes [ARROW-1714](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1714).
    
    Author: Licht-T <li...@outlook.jp>
    Author: Wes McKinney <we...@twosigma.com>
    
    Closes #1263 from Licht-T/fix-invaid-conversion-none-column-name and squashes the following commits:
    
    3afa60d [Wes McKinney] Be robust to pandas 0.21 conventions for null column labels
    b353260 [Wes McKinney] Don't use locals()
    fe6a075 [Licht-T] TST: Add test for None name Series serialization
    3535dc4 [Licht-T] BUG: Fix invalid deserialization of None column name
---
 python/pyarrow/pandas_compat.py  | 52 ++++++++++++++++++++++++++++++----------
 python/pyarrow/table.pxi         |  5 +++-
 python/pyarrow/tests/test_ipc.py |  9 ++++++-
 3 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 1984598..87b47b8 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -263,6 +263,8 @@ def _column_name_to_strings(name):
         return tuple(map(_column_name_to_strings, name))
     elif isinstance(name, collections.Sequence):
         raise TypeError("Unsupported type for MultiIndex level")
+    elif name is None:
+        return None
     return str(name)
 
 
@@ -280,7 +282,9 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
     for name in df.columns:
         col = df[name]
         if not isinstance(name, six.string_types):
-            name = str(_column_name_to_strings(name))
+            name = _column_name_to_strings(name)
+            if name is not None:
+                name = str(name)
 
         if schema is not None:
             field = schema.field_by_name(name)
@@ -361,6 +365,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
     schema = table.schema
     row_count = table.num_rows
     metadata = schema.metadata
+    columns_metadata = None
 
     has_pandas_metadata = metadata is not None and b'pandas' in metadata
 
@@ -370,6 +375,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
         columns = pandas_metadata['columns']
         column_indexes = pandas_metadata.get('column_indexes', [])
         table = _add_any_metadata(table, pandas_metadata)
+        columns_metadata = pandas_metadata.get('columns', None)
 
     block_table = table
 
@@ -428,6 +434,18 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
         index = pd.RangeIndex(row_count)
 
     column_strings = [x.name for x in block_table.itercolumns()]
+    if columns_metadata is not None:
+        columns_name_dict = dict(
+            (str(x['name']), x['name'])
+            for x in columns_metadata
+        )
+        columns_values = [
+            columns_name_dict[y]
+            if y in columns_name_dict.keys() else y
+            for y in column_strings
+        ]
+    else:
+        columns_values = column_strings
 
     # If we're passed multiple column indexes then evaluate with
     # ast.literal_eval, since the column index values show up as a list of
@@ -437,11 +455,11 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
     # Create the column index
 
     # Construct the base index
-    if not column_strings:
-        columns = pd.Index(column_strings)
+    if not columns_values:
+        columns = pd.Index(columns_values)
     else:
         columns = pd.MultiIndex.from_tuples(
-            list(map(to_pair, column_strings)),
+            list(map(to_pair, columns_values)),
             names=[col_index['name'] for col_index in column_indexes] or None,
         )
 
@@ -466,25 +484,35 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
             _level if _level.dtype == _dtype else _level.astype(_dtype)
             for _level, _dtype in levels_dtypes
         ]
+
         columns = pd.MultiIndex(
             levels=new_levels,
             labels=labels,
             names=columns.names
         )
 
-    # flatten a single level column MultiIndex for pandas 0.21.0 :(
-    if isinstance(columns, pd.MultiIndex) and columns.nlevels == 1:
-        levels, = columns.levels
-        labels, = columns.labels
-
-        # Cheaply check that we do not somehow have duplicate column names
-        assert len(levels) == len(labels), 'Found non-unique column index'
-        columns = levels[labels]
+    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
+    columns = _flatten_single_level_multiindex(columns)
 
     axes = [columns, index]
     return _int.BlockManager(blocks, axes)
 
 
+def _flatten_single_level_multiindex(index):
+    if isinstance(index, pd.MultiIndex) and index.nlevels == 1:
+        levels, = index.levels
+        labels, = index.labels
+
+        # Cheaply check that we do not somehow have duplicate column names
+        if not index.is_unique:
+            raise ValueError('Found non-unique column index')
+
+        return pd.Index([levels[_label] if _label != -1 else None
+                         for _label in labels],
+                        name=index.names[0])
+    return index
+
+
 def _add_any_metadata(table, pandas_metadata):
     modified_columns = {}
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 6165a66..5ba5f83 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -345,7 +345,10 @@ cdef _schema_from_arrays(arrays, names, dict metadata,
             else:
                 raise TypeError(type(val))
 
-            c_name = tobytes(names[i])
+            if names[i] is None:
+                c_name = tobytes(u'None')
+            else:
+                c_name = tobytes(names[i])
             fields[i].reset(new CField(c_name, type_, True))
 
     schema.reset(new CSchema(fields, unbox_metadata(metadata)))
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 68c0c80..5033ea9 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -432,16 +432,23 @@ def test_serialize_pandas_no_preserve_index():
 
 def test_serialize_with_pandas_objects():
     df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
+    s = pd.Series([1, 2, 3, 4])
 
     data = {
         'a_series': df['a'],
-        'a_frame': df
+        'a_frame': df,
+        's_series': s
     }
 
     serialized = pa.serialize(data).to_buffer()
     deserialized = pa.deserialize(serialized)
     assert_frame_equal(deserialized['a_frame'], df)
+
     assert_series_equal(deserialized['a_series'], df['a'])
+    assert deserialized['a_series'].name == 'a'
+
+    assert_series_equal(deserialized['s_series'], s)
+    assert deserialized['s_series'].name is None
 
 
 def test_schema_batch_serialize_methods():

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].