You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/12/13 12:36:30 UTC
[arrow] branch master updated: ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1882a07  ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow
1882a07 is described below

commit 1882a0727ba275fbced9ed0754c5fe99f841bed4
Author: Tanya Schlusser <ta...@tickel.net>
AuthorDate: Thu Dec 13 13:36:21 2018 +0100

    ARROW-3866: [Python] Column metadata is not transferred to tables in pyarrow
    
    Use columns' existing metadata to create the new fields in `Table.from_arrays()`.
    Also preserve each column's original `nullable` value instead of always marking the field as nullable.
    
    Happy to change things! Thank you for putting a newbie label on it.
    
    Author: Tanya Schlusser <ta...@tickel.net>
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #3160 from tanyaschlusser/ARROW-3866 and squashes the following commits:
    
    005940ea <Tanya Schlusser> Move the test for preserved metadata to a separate function. Add a test that nullable=False is preserved.
    e4256a17 <Krisztián Szűcs> use column.field()
    76216eae <Tanya Schlusser> Arrow-3866:  keep field metadata for columns passed to pa.Table.from_arrays()
    33950a83 <Tanya Schlusser> ARROW-3866:  test to confirm column metadata is added when calling pa.Table.from_arrays(column_list)
---
 python/pyarrow/table.pxi           | 24 +++++++++++-------------
 python/pyarrow/tests/test_table.py | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index fd565af..cf3411d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -636,12 +636,12 @@ cdef class Column:
 
 cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
     cdef:
-        Column col
-        c_string c_name
-        vector[shared_ptr[CField]] fields
-        shared_ptr[CDataType] type_
         Py_ssize_t K = len(arrays)
+        c_string c_name
+        CColumn* c_column
+        shared_ptr[CDataType] c_type
         shared_ptr[CKeyValueMetadata] c_meta
+        vector[shared_ptr[CField]] c_fields
 
     if metadata is not None:
         if not isinstance(metadata, dict):
@@ -649,17 +649,15 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
         c_meta = pyarrow_unwrap_metadata(metadata)
 
     if K == 0:
-        schema.reset(new CSchema(fields, c_meta))
+        schema.reset(new CSchema(c_fields, c_meta))
         return
 
-    fields.resize(K)
+    c_fields.resize(K)
 
     if isinstance(arrays[0], Column):
         for i in range(K):
-            col = arrays[i]
-            type_ = col.sp_column.get().type()
-            c_name = tobytes(col.name)
-            fields[i].reset(new CField(c_name, type_, True))
+            c_column = (<Column>arrays[i]).column
+            c_fields[i] = c_column.field()
     else:
         if names is None:
             raise ValueError('Must pass names when constructing '
@@ -670,7 +668,7 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
         for i in range(K):
             val = arrays[i]
             if isinstance(val, (Array, ChunkedArray)):
-                type_ = (<DataType> val.type).sp_type
+                c_type = (<DataType> val.type).sp_type
             else:
                 raise TypeError(type(val))
 
@@ -678,9 +676,9 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema):
                 c_name = tobytes(u'None')
             else:
                 c_name = tobytes(names[i])
-            fields[i].reset(new CField(c_name, type_, True))
+            c_fields[i].reset(new CField(c_name, c_type, True))
 
-    schema.reset(new CSchema(fields, c_meta))
+    schema.reset(new CSchema(c_fields, c_meta))
 
 
 cdef class RecordBatch:
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 9c9828d..ecbf93b 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -579,6 +579,21 @@ def test_table_basics():
     assert table.columns == columns
 
 
+def test_table_from_arrays_preserves_column_metadata():
+    # Added to test https://issues.apache.org/jira/browse/ARROW-3866
+    arr0 = pa.array([1, 2])
+    arr1 = pa.array([3, 4])
+    field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B"))
+    field1 = pa.field('field2', pa.int64(), nullable=False)
+    columns = [
+        pa.column(field0, arr0),
+        pa.column(field1, arr1)
+    ]
+    table = pa.Table.from_arrays(columns)
+    assert b"a" in table.column(0).field.metadata
+    assert table.column(1).field.nullable is False
+
+
 def test_table_from_arrays_invalid_names():
     data = [
         pa.array(range(5)),