Posted to commits@arrow.apache.org by uw...@apache.org on 2016/12/07 14:00:37 UTC

arrow git commit: ARROW-409: [Python] Change record batches conversion to Table

Repository: arrow
Updated Branches:
  refs/heads/master 4b72329fe -> 72f80d450


ARROW-409: [Python] Change record batches conversion to Table

From discussion in ARROW-369, it is more consistent and flexible for the pyarrow.Table API to convert a list of RecordBatches first to a Table, and then the Table to a pandas.DataFrame.  For example:
```
table = pa.Table.from_batches(batches)
df = table.to_pandas()
```
Also updated the conversion to print both schemas in the exception message if they are not equal.
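
For instance, a minimal sketch based on the updated tests (the DataFrame contents and column names here are illustrative only):
```
import pandas as pd
import pyarrow as pa

data1 = pd.DataFrame({'somedata': [1, 2, 3]})
data2 = pd.DataFrame({'somedata': [4, 5]})

batch1 = pa.RecordBatch.from_pandas(data1)
batch2 = pa.RecordBatch.from_pandas(data2)

# batches with equal schemas combine into a single Table
table = pa.Table.from_batches([batch1, batch2])
df = table.to_pandas()

# batches with differing schemas raise ArrowException, and the
# exception message now includes both schemas
other = pa.RecordBatch.from_pandas(pd.DataFrame({'otherdata': ['a', 'b']}))
try:
    pa.Table.from_batches([batch1, other])
except pa.ArrowException as e:
    print(e)
```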

Author: Bryan Cutler <cu...@gmail.com>

Closes #229 from BryanCutler/pyarrow-table-from_batches-ARROW-409 and squashes the following commits:

f5751e0 [Bryan Cutler] fixed schema check to print out if not equal
72ea875 [Bryan Cutler] changed batches conversion to Table instead


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/72f80d45
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/72f80d45
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/72f80d45

Branch: refs/heads/master
Commit: 72f80d450e0e8e20812fd80571b0c1d18e88114a
Parents: 4b72329
Author: Bryan Cutler <cu...@gmail.com>
Authored: Wed Dec 7 15:00:18 2016 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Dec 7 15:00:18 2016 +0100

----------------------------------------------------------------------
 python/pyarrow/__init__.py         |  3 +-
 python/pyarrow/table.pyx           | 94 +++++++++++++++++----------------
 python/pyarrow/tests/test_table.py |  5 +-
 3 files changed, 52 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index f366317..5af93fb 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -49,5 +49,4 @@ from pyarrow.schema import (null, bool_,
                             list_, struct, field,
                             DataType, Field, Schema, schema)
 
-from pyarrow.table import (Column, RecordBatch, dataframe_from_batches, Table,
-                           from_pandas_dataframe)
+from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe

http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 45cf7be..0a9805c 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -415,52 +415,6 @@ cdef class RecordBatch:
         return result
 
 
-def dataframe_from_batches(batches):
-    """
-    Convert a list of Arrow RecordBatches to a pandas.DataFrame
-
-    Parameters
-    ----------
-
-    batches: list of RecordBatch
-        RecordBatch list to be converted, schemas must be equal
-    """
-
-    cdef:
-        vector[shared_ptr[CArray]] c_array_chunks
-        vector[shared_ptr[CColumn]] c_columns
-        shared_ptr[CTable] c_table
-        Array arr
-        Schema schema
-
-    import pandas as pd
-
-    schema = batches[0].schema
-
-    # check schemas are equal
-    if any((not schema.equals(other.schema) for other in batches[1:])):
-        raise ArrowException("Error converting list of RecordBatches to "
-                "DataFrame, not all schemas are equal")
-
-    cdef int K = batches[0].num_columns
-
-    # create chunked columns from the batches
-    c_columns.resize(K)
-    for i in range(K):
-        for batch in batches:
-            arr = batch[i]
-            c_array_chunks.push_back(arr.sp_array)
-        c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
-                           c_array_chunks))
-        c_array_chunks.clear()
-
-    # create a Table from columns and convert to DataFrame
-    c_table.reset(new CTable('', schema.sp_schema, c_columns))
-    table = Table()
-    table.init(c_table)
-    return table.to_pandas()
-
-
 cdef class Table:
     """
     A collection of top-level named, equal length Arrow arrays.
@@ -567,6 +521,54 @@ cdef class Table:
 
         return result
 
+    @staticmethod
+    def from_batches(batches):
+        """
+        Construct a Table from a list of Arrow RecordBatches
+
+        Parameters
+        ----------
+
+        batches: list of RecordBatch
+            RecordBatch list to be converted, schemas must be equal
+        """
+
+        cdef:
+            vector[shared_ptr[CArray]] c_array_chunks
+            vector[shared_ptr[CColumn]] c_columns
+            shared_ptr[CTable] c_table
+            Array arr
+            Schema schema
+
+        import pandas as pd
+
+        schema = batches[0].schema
+
+        # check schemas are equal
+        for other in batches[1:]:
+            if not schema.equals(other.schema):
+                raise ArrowException("Error converting list of RecordBatches "
+                        "to DataFrame, not all schemas are equal: {%s} != {%s}"
+                        % (str(schema), str(other.schema)))
+
+        cdef int K = batches[0].num_columns
+
+        # create chunked columns from the batches
+        c_columns.resize(K)
+        for i in range(K):
+            for batch in batches:
+                arr = batch[i]
+                c_array_chunks.push_back(arr.sp_array)
+            c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
+                               c_array_chunks))
+            c_array_chunks.clear()
+
+        # create a Table from the chunked columns
+        c_table.reset(new CTable('', schema.sp_schema, c_columns))
+        table = Table()
+        table.init(c_table)
+        return table
+
     def to_pandas(self):
         """
         Convert the arrow::Table to a pandas DataFrame

http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/tests/test_table.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index dc4f37a..2546314 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -69,7 +69,8 @@ def test_recordbatchlist_to_pandas():
     batch1 = pa.RecordBatch.from_pandas(data1)
     batch2 = pa.RecordBatch.from_pandas(data2)
 
-    result = pa.dataframe_from_batches([batch1, batch2])
+    table = pa.Table.from_batches([batch1, batch2])
+    result = table.to_pandas()
     data = pd.concat([data1, data2], ignore_index=True)
     assert_frame_equal(data, result)
 
@@ -82,7 +83,7 @@ def test_recordbatchlist_schema_equals():
     batch2 = pa.RecordBatch.from_pandas(data2)
 
     with pytest.raises(pa.ArrowException):
-        pa.dataframe_from_batches([batch1, batch2])
+        pa.Table.from_batches([batch1, batch2])
 
 
 def test_table_basics():