You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2023/04/27 13:18:49 UTC
[arrow] branch main updated: GH-34979: [Python] Create a base class for Table and RecordBatch (#34980)
This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7bf1dec7f8 GH-34979: [Python] Create a base class for Table and RecordBatch (#34980)
7bf1dec7f8 is described below
commit 7bf1dec7f8bab44522fcbf84263045cc5a24e534
Author: Dane Pitkin <48...@users.noreply.github.com>
AuthorDate: Thu Apr 27 09:18:37 2023 -0400
GH-34979: [Python] Create a base class for Table and RecordBatch (#34980)
### Rationale for this change
This is an incremental first step towards https://github.com/apache/arrow/issues/30559
### What changes are included in this PR?
Introduce `class _Table` in `table.pxi`.
### Are these changes tested?
Existing pytests will check for regressions.
### Are there any user-facing changes?
No
* Closes: #34979
Authored-by: Dane Pitkin <da...@voltrondata.com>
Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
python/pyarrow/lib.pxd | 8 +-
python/pyarrow/table.pxi | 303 +++++++++++++++++++++--------------------------
2 files changed, 138 insertions(+), 173 deletions(-)
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 54e14005f6..e8c89cf0d5 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -474,7 +474,11 @@ cdef class ChunkedArray(_PandasConvertible):
cdef getitem(self, int64_t i)
-cdef class Table(_PandasConvertible):
+cdef class _Tabular(_PandasConvertible):
+ pass
+
+
+cdef class Table(_Tabular):
cdef:
shared_ptr[CTable] sp_table
CTable* table
@@ -482,7 +486,7 @@ cdef class Table(_PandasConvertible):
cdef void init(self, const shared_ptr[CTable]& table)
-cdef class RecordBatch(_PandasConvertible):
+cdef class RecordBatch(_Tabular):
cdef:
shared_ptr[CRecordBatch] sp_batch
CRecordBatch* batch
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index e8baa79250..893c0c2f3d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1450,8 +1450,129 @@ cdef _sanitize_arrays(arrays, names, schema, metadata,
converted_arrays.append(item)
return converted_arrays
+cdef class _Tabular(_PandasConvertible):
+ """Internal: An interface for common operations on tabular objects."""
-cdef class RecordBatch(_PandasConvertible):
+ def __init__(self):
+ raise TypeError("This object is not instantiable, "
+ "use a subclass instead.")
+
+ def __repr__(self):
+ if not self._is_initialized():
+ raise ValueError("This object's internal pointer is NULL, do not "
+ "use any methods or attributes on this object")
+ return self.to_string(preview_cols=10)
+
+ def _is_initialized(self):
+ raise NotImplementedError
+
+ def drop_null(self):
+ """
+ Remove rows that contain missing values from a Table or RecordBatch.
+
+ See :func:`pyarrow.compute.drop_null` for full usage.
+
+ Returns
+ -------
+ Table or RecordBatch
+ A tabular object with the same schema, with rows containing
+ no missing values.
+
+ Examples
+ --------
+ Table (works similarly for RecordBatch)
+
+ >>> import pyarrow as pa
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({'year': [None, 2022, 2019, 2021],
+ ... 'n_legs': [2, 4, 5, 100],
+ ... 'animals': ["Flamingo", "Horse", None, "Centipede"]})
+ >>> table = pa.Table.from_pandas(df)
+ >>> table.drop_null()
+ pyarrow.Table
+ year: double
+ n_legs: int64
+ animals: string
+ ----
+ year: [[2022,2021]]
+ n_legs: [[4,100]]
+ animals: [["Horse","Centipede"]]
+ """
+ return _pc().drop_null(self)
+
+ def take(self, object indices):
+ """
+ Select rows from a Table or RecordBatch.
+
+ See :func:`pyarrow.compute.take` for full usage.
+
+ Parameters
+ ----------
+ indices : Array or array-like
+ The indices in the tabular object whose rows will be returned.
+
+ Returns
+ -------
+ Table or RecordBatch
+ A tabular object with the same schema, containing the taken rows.
+
+ Examples
+ --------
+ Table (works similarly for RecordBatch)
+
+ >>> import pyarrow as pa
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
+ ... 'n_legs': [2, 4, 5, 100],
+ ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
+ >>> table = pa.Table.from_pandas(df)
+ >>> table.take([1,3])
+ pyarrow.Table
+ year: int64
+ n_legs: int64
+ animals: string
+ ----
+ year: [[2022,2021]]
+ n_legs: [[4,100]]
+ animals: [["Horse","Centipede"]]
+ """
+ return _pc().take(self, indices)
+
+ def to_string(self, *, show_metadata=False, preview_cols=0):
+ """
+ Return human-readable string representation of Table or RecordBatch.
+
+ Parameters
+ ----------
+ show_metadata : bool, default False
+ Display Field-level and Schema-level KeyValueMetadata.
+ preview_cols : int, default 0
+ Display values of the columns for the first N columns.
+
+ Returns
+ -------
+ str
+ """
+ # Use less verbose schema output.
+ schema_as_string = self.schema.to_string(
+ show_field_metadata=show_metadata,
+ show_schema_metadata=show_metadata
+ )
+ title = 'pyarrow.{}\n{}'.format(type(self).__name__, schema_as_string)
+ pieces = [title]
+ if preview_cols:
+ pieces.append('----')
+ for i in range(min(self.num_columns, preview_cols)):
+ pieces.append('{}: {}'.format(
+ self.field(i).name,
+ self.column(i).to_string(indent=0, skip_new_lines=True)
+ ))
+ if preview_cols < self.num_columns:
+ pieces.append('...')
+ return '\n'.join(pieces)
+
+
+cdef class RecordBatch(_Tabular):
"""
Batch of rows of columns of equal length
@@ -1545,6 +1666,9 @@ cdef class RecordBatch(_PandasConvertible):
self.sp_batch = batch
self.batch = batch.get()
+ def _is_initialized(self):
+ return self.batch != NULL
+
# ----------------------------------------------------------------------
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
"""
@@ -1696,15 +1820,10 @@ cdef class RecordBatch(_PandasConvertible):
except TypeError:
return NotImplemented
- def to_string(self, show_metadata=False):
- # Use less verbose schema output.
- schema_as_string = self.schema.to_string(
- show_field_metadata=show_metadata,
- show_schema_metadata=show_metadata
- )
- return 'pyarrow.{}\n{}'.format(type(self).__name__, schema_as_string)
-
def __repr__(self):
+ # TODO remove this and update pytests/doctests for
+ # RecordBatch.to_string(preview_cols=10) usage in
+ # parent class
return self.to_string()
def validate(self, *, full=False):
@@ -2254,67 +2373,6 @@ cdef class RecordBatch(_PandasConvertible):
return result
- def take(self, object indices):
- """
- Select rows from the record batch.
-
- See :func:`pyarrow.compute.take` for full usage.
-
- Parameters
- ----------
- indices : Array or array-like
- The indices in the record batch whose rows will be returned.
-
- Returns
- -------
- taken : RecordBatch
- A record batch with the same schema, containing the taken rows.
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
- >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"])
- >>> batch = pa.RecordBatch.from_arrays([n_legs, animals],
- ... names=["n_legs", "animals"])
- >>> batch.take([1,3,4]).to_pandas()
- n_legs animals
- 0 2 Parrot
- 1 4 Horse
- 2 5 Brittle stars
- """
- return _pc().take(self, indices)
-
- def drop_null(self):
- """
- Remove missing values from a RecordBatch.
- See :func:`pyarrow.compute.drop_null` for full usage.
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
- >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", None, "Centipede"])
- >>> batch = pa.RecordBatch.from_arrays([n_legs, animals],
- ... names=["n_legs", "animals"])
- >>> batch.to_pandas()
- n_legs animals
- 0 2 Flamingo
- 1 2 Parrot
- 2 4 Dog
- 3 4 Horse
- 4 5 None
- 5 100 Centipede
- >>> batch.drop_null().to_pandas()
- n_legs animals
- 0 2 Flamingo
- 1 2 Parrot
- 2 4 Dog
- 3 4 Horse
- 4 100 Centipede
- """
- return _pc().drop_null(self)
-
def select(self, object columns):
"""
Select columns of the RecordBatch.
@@ -2776,7 +2834,7 @@ def table_to_blocks(options, Table table, categories, extension_columns):
return PyObject_to_object(result_obj)
-cdef class Table(_PandasConvertible):
+cdef class Table(_Tabular):
"""
A collection of top-level named, equal length Arrow arrays.
@@ -2895,49 +2953,13 @@ cdef class Table(_PandasConvertible):
raise TypeError("Do not call Table's constructor directly, use one of "
"the `Table.from_*` functions instead.")
- def to_string(self, *, show_metadata=False, preview_cols=0):
- """
- Return human-readable string representation of Table.
-
- Parameters
- ----------
- show_metadata : bool, default False
- Display Field-level and Schema-level KeyValueMetadata.
- preview_cols : int, default 0
- Display values of the columns for the first N columns.
-
- Returns
- -------
- str
- """
- # Use less verbose schema output.
- schema_as_string = self.schema.to_string(
- show_field_metadata=show_metadata,
- show_schema_metadata=show_metadata
- )
- title = 'pyarrow.{}\n{}'.format(type(self).__name__, schema_as_string)
- pieces = [title]
- if preview_cols:
- pieces.append('----')
- for i in range(min(self.num_columns, preview_cols)):
- pieces.append('{}: {}'.format(
- self.field(i).name,
- self.column(i).to_string(indent=0, skip_new_lines=True)
- ))
- if preview_cols < self.num_columns:
- pieces.append('...')
- return '\n'.join(pieces)
-
- def __repr__(self):
- if self.table == NULL:
- raise ValueError("Table's internal pointer is NULL, do not use "
- "any methods or attributes on this object")
- return self.to_string(preview_cols=10)
-
cdef void init(self, const shared_ptr[CTable]& table):
self.sp_table = table
self.table = table.get()
+ def _is_initialized(self):
+ return self.table != NULL
+
def validate(self, *, full=False):
"""
Perform validation checks. An exception is raised if validation fails.
@@ -3153,67 +3175,6 @@ cdef class Table(_PandasConvertible):
else:
return _pc().filter(self, mask, null_selection_behavior)
- def take(self, object indices):
- """
- Select rows from the table.
-
- See :func:`pyarrow.compute.take` for full usage.
-
- Parameters
- ----------
- indices : Array or array-like
- The indices in the table whose rows will be returned.
-
- Returns
- -------
- taken : Table
- A table with the same schema, containing the taken rows.
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
- ... 'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
- >>> table.take([1,3])
- pyarrow.Table
- year: int64
- n_legs: int64
- animals: string
- ----
- year: [[2022,2021]]
- n_legs: [[4,100]]
- animals: [["Horse","Centipede"]]
- """
- return _pc().take(self, indices)
-
- def drop_null(self):
- """
- Remove missing values from a Table.
- See :func:`pyarrow.compute.drop_null` for full usage.
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> import pandas as pd
- >>> df = pd.DataFrame({'year': [None, 2022, 2019, 2021],
- ... 'n_legs': [2, 4, 5, 100],
- ... 'animals': ["Flamingo", "Horse", None, "Centipede"]})
- >>> table = pa.Table.from_pandas(df)
- >>> table.drop_null()
- pyarrow.Table
- year: double
- n_legs: int64
- animals: string
- ----
- year: [[2022,2021]]
- n_legs: [[4,100]]
- animals: [["Horse","Centipede"]]
- """
- return _pc().drop_null(self)
-
def select(self, object columns):
"""
Select columns of the Table.