You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/11/05 12:32:19 UTC
[arrow] branch master updated: ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ea4a8f5 ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series
ea4a8f5 is described below
commit ea4a8f5a01c0c028bbec1e199ca70efaffbf068b
Author: Licht-T <li...@outlook.jp>
AuthorDate: Sun Nov 5 13:32:13 2017 +0100
ARROW-1714: [Python] Fix invalid serialization/deserialization None name Series
This closes [ARROW-1714](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-1714).
Author: Licht-T <li...@outlook.jp>
Author: Wes McKinney <we...@twosigma.com>
Closes #1263 from Licht-T/fix-invaid-conversion-none-column-name and squashes the following commits:
3afa60d [Wes McKinney] Be robust to pandas 0.21 conventions for null column labels
b353260 [Wes McKinney] Don't use locals()
fe6a075 [Licht-T] TST: Add test for None name Series serialization
3535dc4 [Licht-T] BUG: Fix invalid deserialization of None column name
---
python/pyarrow/pandas_compat.py | 52 ++++++++++++++++++++++++++++++----------
python/pyarrow/table.pxi | 5 +++-
python/pyarrow/tests/test_ipc.py | 9 ++++++-
3 files changed, 52 insertions(+), 14 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 1984598..87b47b8 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -263,6 +263,8 @@ def _column_name_to_strings(name):
return tuple(map(_column_name_to_strings, name))
elif isinstance(name, collections.Sequence):
raise TypeError("Unsupported type for MultiIndex level")
+ elif name is None:
+ return None
return str(name)
@@ -280,7 +282,9 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
for name in df.columns:
col = df[name]
if not isinstance(name, six.string_types):
- name = str(_column_name_to_strings(name))
+ name = _column_name_to_strings(name)
+ if name is not None:
+ name = str(name)
if schema is not None:
field = schema.field_by_name(name)
@@ -361,6 +365,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
schema = table.schema
row_count = table.num_rows
metadata = schema.metadata
+ columns_metadata = None
has_pandas_metadata = metadata is not None and b'pandas' in metadata
@@ -370,6 +375,7 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
columns = pandas_metadata['columns']
column_indexes = pandas_metadata.get('column_indexes', [])
table = _add_any_metadata(table, pandas_metadata)
+ columns_metadata = pandas_metadata.get('columns', None)
block_table = table
@@ -428,6 +434,18 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
index = pd.RangeIndex(row_count)
column_strings = [x.name for x in block_table.itercolumns()]
+ if columns_metadata is not None:
+ columns_name_dict = dict(
+ (str(x['name']), x['name'])
+ for x in columns_metadata
+ )
+ columns_values = [
+ columns_name_dict[y]
+ if y in columns_name_dict.keys() else y
+ for y in column_strings
+ ]
+ else:
+ columns_values = column_strings
# If we're passed multiple column indexes then evaluate with
# ast.literal_eval, since the column index values show up as a list of
@@ -437,11 +455,11 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
# Create the column index
# Construct the base index
- if not column_strings:
- columns = pd.Index(column_strings)
+ if not columns_values:
+ columns = pd.Index(columns_values)
else:
columns = pd.MultiIndex.from_tuples(
- list(map(to_pair, column_strings)),
+ list(map(to_pair, columns_values)),
names=[col_index['name'] for col_index in column_indexes] or None,
)
@@ -466,25 +484,35 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
_level if _level.dtype == _dtype else _level.astype(_dtype)
for _level, _dtype in levels_dtypes
]
+
columns = pd.MultiIndex(
levels=new_levels,
labels=labels,
names=columns.names
)
- # flatten a single level column MultiIndex for pandas 0.21.0 :(
- if isinstance(columns, pd.MultiIndex) and columns.nlevels == 1:
- levels, = columns.levels
- labels, = columns.labels
-
- # Cheaply check that we do not somehow have duplicate column names
- assert len(levels) == len(labels), 'Found non-unique column index'
- columns = levels[labels]
+ # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
+ columns = _flatten_single_level_multiindex(columns)
axes = [columns, index]
return _int.BlockManager(blocks, axes)
+def _flatten_single_level_multiindex(index):
+ if isinstance(index, pd.MultiIndex) and index.nlevels == 1:
+ levels, = index.levels
+ labels, = index.labels
+
+ # Cheaply check that we do not somehow have duplicate column names
+ if not index.is_unique:
+ raise ValueError('Found non-unique column index')
+
+ return pd.Index([levels[_label] if _label != -1 else None
+ for _label in labels],
+ name=index.names[0])
+ return index
+
+
def _add_any_metadata(table, pandas_metadata):
modified_columns = {}
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 6165a66..5ba5f83 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -345,7 +345,10 @@ cdef _schema_from_arrays(arrays, names, dict metadata,
else:
raise TypeError(type(val))
- c_name = tobytes(names[i])
+ if names[i] is None:
+ c_name = tobytes(u'None')
+ else:
+ c_name = tobytes(names[i])
fields[i].reset(new CField(c_name, type_, True))
schema.reset(new CSchema(fields, unbox_metadata(metadata)))
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 68c0c80..5033ea9 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -432,16 +432,23 @@ def test_serialize_pandas_no_preserve_index():
def test_serialize_with_pandas_objects():
df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
+ s = pd.Series([1, 2, 3, 4])
data = {
'a_series': df['a'],
- 'a_frame': df
+ 'a_frame': df,
+ 's_series': s
}
serialized = pa.serialize(data).to_buffer()
deserialized = pa.deserialize(serialized)
assert_frame_equal(deserialized['a_frame'], df)
+
assert_series_equal(deserialized['a_series'], df['a'])
+ assert deserialized['a_series'].name == 'a'
+
+ assert_series_equal(deserialized['s_series'], s)
+ assert deserialized['s_series'].name is None
def test_schema_batch_serialize_methods():
--
To stop receiving notification emails like this one, please contact
"commits@arrow.apache.org" <co...@arrow.apache.org>.