You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/12/13 12:36:30 UTC
[arrow] branch master updated: ARROW-3866: [Python] Column metadata
is not transferred to tables in pyarrow
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1882a07 ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow
1882a07 is described below
commit 1882a0727ba275fbced9ed0754c5fe99f841bed4
Author: Tanya Schlusser <ta...@tickel.net>
AuthorDate: Thu Dec 13 13:36:21 2018 +0100
ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow
Use columns' existing metadata to create the new fields in `Table.from_arrays()`.
Also persists the original `nullable` value.
Happy to change things! Thank you for putting a newbie label on it.
Author: Tanya Schlusser <ta...@tickel.net>
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #3160 from tanyaschlusser/ARROW-3866 and squashes the following commits:
005940ea <Tanya Schlusser> Move the test for preserved metadata to a separate function. Add a test that nullable=False is preserved.
e4256a17 <Krisztián Szűcs> use column.field()
76216eae <Tanya Schlusser> Arrow-3866: keep field metadata for columns passed to pa.Table.from_arrays()
33950a83 <Tanya Schlusser> ARROW-3866: test to confirm column metadata is added when calling pa.Table.from_arrays(column_list)
---
python/pyarrow/table.pxi | 24 +++++++++++-------------
python/pyarrow/tests/test_table.py | 15 +++++++++++++++
2 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index fd565af..cf3411d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -636,12 +636,12 @@ cdef class Column:
cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
cdef:
- Column col
- c_string c_name
- vector[shared_ptr[CField]] fields
- shared_ptr[CDataType] type_
Py_ssize_t K = len(arrays)
+ c_string c_name
+ CColumn* c_column
+ shared_ptr[CDataType] c_type
shared_ptr[CKeyValueMetadata] c_meta
+ vector[shared_ptr[CField]] c_fields
if metadata is not None:
if not isinstance(metadata, dict):
@@ -649,17 +649,15 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
c_meta = pyarrow_unwrap_metadata(metadata)
if K == 0:
- schema.reset(new CSchema(fields, c_meta))
+ schema.reset(new CSchema(c_fields, c_meta))
return
- fields.resize(K)
+ c_fields.resize(K)
if isinstance(arrays[0], Column):
for i in range(K):
- col = arrays[i]
- type_ = col.sp_column.get().type()
- c_name = tobytes(col.name)
- fields[i].reset(new CField(c_name, type_, True))
+ c_column = (<Column>arrays[i]).column
+ c_fields[i] = c_column.field()
else:
if names is None:
raise ValueError('Must pass names when constructing '
@@ -670,7 +668,7 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
for i in range(K):
val = arrays[i]
if isinstance(val, (Array, ChunkedArray)):
- type_ = (<DataType> val.type).sp_type
+ c_type = (<DataType> val.type).sp_type
else:
raise TypeError(type(val))
@@ -678,9 +676,9 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
c_name = tobytes(u'None')
else:
c_name = tobytes(names[i])
- fields[i].reset(new CField(c_name, type_, True))
+ c_fields[i].reset(new CField(c_name, c_type, True))
- schema.reset(new CSchema(fields, c_meta))
+ schema.reset(new CSchema(c_fields, c_meta))
cdef class RecordBatch:
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 9c9828d..ecbf93b 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -579,6 +579,21 @@ def test_table_basics():
assert table.columns == columns
+def test_table_from_arrays_preserves_column_metadata():
+ # Added to test https://issues.apache.org/jira/browse/ARROW-3866
+ arr0 = pa.array([1, 2])
+ arr1 = pa.array([3, 4])
+ field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
+ field1 = pa.field('field2', pa.int64(), nullable=False)
+ columns = [
+ pa.column(field0, arr0),
+ pa.column(field1, arr1)
+ ]
+ table = pa.Table.from_arrays(columns)
+ assert b"a" in table.column(0).field.metadata
+ assert table.column(1).field.nullable is False
+
+
def test_table_from_arrays_invalid_names():
data = [
pa.array(range(5)),