You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2016/12/07 14:00:37 UTC
arrow git commit: ARROW-409: [Python] Change record batches conversion to Table
Repository: arrow
Updated Branches:
refs/heads/master 4b72329fe -> 72f80d450
ARROW-409: [Python] Change record batches conversion to Table
From discussion in ARROW-369, it is more consistent and flexible for the pyarrow.Table API to convert a RecordBatch list first to a Table, and then the Table to a pandas.DataFrame. For example:
```
table = pa.Table.from_batches(batches)
df = table.to_pandas()
```
Also updated the schema check to print both schemas in the exception message when they are not equal.
Author: Bryan Cutler <cu...@gmail.com>
Closes #229 from BryanCutler/pyarrow-table-from_batches-ARROW-409 and squashes the following commits:
f5751e0 [Bryan Cutler] fixed schema check to print out if not equal
72ea875 [Bryan Cutler] changed batches conversion to Table instead
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/72f80d45
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/72f80d45
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/72f80d45
Branch: refs/heads/master
Commit: 72f80d450e0e8e20812fd80571b0c1d18e88114a
Parents: 4b72329
Author: Bryan Cutler <cu...@gmail.com>
Authored: Wed Dec 7 15:00:18 2016 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Dec 7 15:00:18 2016 +0100
----------------------------------------------------------------------
python/pyarrow/__init__.py | 3 +-
python/pyarrow/table.pyx | 94 +++++++++++++++++----------------
python/pyarrow/tests/test_table.py | 5 +-
3 files changed, 52 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index f366317..5af93fb 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -49,5 +49,4 @@ from pyarrow.schema import (null, bool_,
list_, struct, field,
DataType, Field, Schema, schema)
-from pyarrow.table import (Column, RecordBatch, dataframe_from_batches, Table,
- from_pandas_dataframe)
+from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe
http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 45cf7be..0a9805c 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -415,52 +415,6 @@ cdef class RecordBatch:
return result
-def dataframe_from_batches(batches):
- """
- Convert a list of Arrow RecordBatches to a pandas.DataFrame
-
- Parameters
- ----------
-
- batches: list of RecordBatch
- RecordBatch list to be converted, schemas must be equal
- """
-
- cdef:
- vector[shared_ptr[CArray]] c_array_chunks
- vector[shared_ptr[CColumn]] c_columns
- shared_ptr[CTable] c_table
- Array arr
- Schema schema
-
- import pandas as pd
-
- schema = batches[0].schema
-
- # check schemas are equal
- if any((not schema.equals(other.schema) for other in batches[1:])):
- raise ArrowException("Error converting list of RecordBatches to "
- "DataFrame, not all schemas are equal")
-
- cdef int K = batches[0].num_columns
-
- # create chunked columns from the batches
- c_columns.resize(K)
- for i in range(K):
- for batch in batches:
- arr = batch[i]
- c_array_chunks.push_back(arr.sp_array)
- c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
- c_array_chunks))
- c_array_chunks.clear()
-
- # create a Table from columns and convert to DataFrame
- c_table.reset(new CTable('', schema.sp_schema, c_columns))
- table = Table()
- table.init(c_table)
- return table.to_pandas()
-
-
cdef class Table:
"""
A collection of top-level named, equal length Arrow arrays.
@@ -567,6 +521,54 @@ cdef class Table:
return result
+ @staticmethod
+ def from_batches(batches):
+ """
+ Construct a Table from a list of Arrow RecordBatches
+
+ Parameters
+ ----------
+
+ batches: list of RecordBatch
+ RecordBatch list to be converted, schemas must be equal
+ """
+
+ cdef:
+ vector[shared_ptr[CArray]] c_array_chunks
+ vector[shared_ptr[CColumn]] c_columns
+ shared_ptr[CTable] c_table
+ Array arr
+ Schema schema
+
+ import pandas as pd
+
+ schema = batches[0].schema
+
+ # check schemas are equal
+ for other in batches[1:]:
+ if not schema.equals(other.schema):
+ raise ArrowException("Error converting list of RecordBatches "
+ "to DataFrame, not all schemas are equal: {%s} != {%s}"
+ % (str(schema), str(other.schema)))
+
+ cdef int K = batches[0].num_columns
+
+ # create chunked columns from the batches
+ c_columns.resize(K)
+ for i in range(K):
+ for batch in batches:
+ arr = batch[i]
+ c_array_chunks.push_back(arr.sp_array)
+ c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i),
+ c_array_chunks))
+ c_array_chunks.clear()
+
+ # create a Table from columns and convert to DataFrame
+ c_table.reset(new CTable('', schema.sp_schema, c_columns))
+ table = Table()
+ table.init(c_table)
+ return table
+
def to_pandas(self):
"""
Convert the arrow::Table to a pandas DataFrame
http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/tests/test_table.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index dc4f37a..2546314 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -69,7 +69,8 @@ def test_recordbatchlist_to_pandas():
batch1 = pa.RecordBatch.from_pandas(data1)
batch2 = pa.RecordBatch.from_pandas(data2)
- result = pa.dataframe_from_batches([batch1, batch2])
+ table = pa.Table.from_batches([batch1, batch2])
+ result = table.to_pandas()
data = pd.concat([data1, data2], ignore_index=True)
assert_frame_equal(data, result)
@@ -82,7 +83,7 @@ def test_recordbatchlist_schema_equals():
batch2 = pa.RecordBatch.from_pandas(data2)
with pytest.raises(pa.ArrowException):
- pa.dataframe_from_batches([batch1, batch2])
+ pa.Table.from_batches([batch1, batch2])
def test_table_basics():