Posted to commits@arrow.apache.org by jo...@apache.org on 2023/04/27 13:18:49 UTC

[arrow] branch main updated: GH-34979: [Python] Create a base class for Table and RecordBatch (#34980)

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7bf1dec7f8 GH-34979: [Python] Create a base class for Table and RecordBatch (#34980)
7bf1dec7f8 is described below

commit 7bf1dec7f8bab44522fcbf84263045cc5a24e534
Author: Dane Pitkin <48...@users.noreply.github.com>
AuthorDate: Thu Apr 27 09:18:37 2023 -0400

    GH-34979: [Python] Create a base class for Table and RecordBatch (#34980)
    
    ### Rationale for this change
    
    This is an incremental first step towards https://github.com/apache/arrow/issues/30559
    
    ### What changes are included in this PR?
    
    Introduce `class _Tabular` in `table.pxi` as a shared base class for `Table` and `RecordBatch`.
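    
    For illustration only (not part of this commit), a minimal sketch of how the shared behaviour looks from Python once both classes inherit from `_Tabular`; it uses only existing public APIs (`Table.from_pydict`, `RecordBatch.from_pydict`, `take`, `drop_null`):
    
    ```python
    import pyarrow as pa
    
    data = {"year": [2020, 2022, None],
            "animals": ["Flamingo", "Horse", None]}
    table = pa.Table.from_pydict(data)
    batch = pa.RecordBatch.from_pydict(data)
    
    # Both classes now inherit take(), drop_null() and to_string() from
    # _Tabular, so the deduplicated methods behave the same on either object.
    table.take([0, 1])    # Table containing rows 0 and 1
    batch.take([0, 1])    # RecordBatch containing rows 0 and 1
    table.drop_null()     # rows containing a null value are removed
    batch.drop_null()
    ```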
    
    ### Are these changes tested?
    
    Existing pytests will check for regressions.
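    
    As a purely hypothetical sketch (not taken from the PR's test suite), the kind of parity the existing pytests exercise could look like this; the test name and structure here are illustrative assumptions:
    
    ```python
    import pyarrow as pa
    
    def test_tabular_take_parity():
        # Table and RecordBatch should return the same rows from the
        # shared take() implementation inherited from _Tabular.
        data = {"n_legs": [2, 4, 5, 100],
                "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]}
        table = pa.Table.from_pydict(data)
        batch = pa.RecordBatch.from_pydict(data)
        assert table.take([1, 3]).to_pydict() == batch.take([1, 3]).to_pydict()
    ```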
    
    ### Are there any user-facing changes?
    
    No
    * Closes: #34979
    
    Authored-by: Dane Pitkin <da...@voltrondata.com>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 python/pyarrow/lib.pxd   |   8 +-
 python/pyarrow/table.pxi | 303 +++++++++++++++++++++--------------------------
 2 files changed, 138 insertions(+), 173 deletions(-)

diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 54e14005f6..e8c89cf0d5 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -474,7 +474,11 @@ cdef class ChunkedArray(_PandasConvertible):
     cdef getitem(self, int64_t i)
 
 
-cdef class Table(_PandasConvertible):
+cdef class _Tabular(_PandasConvertible):
+    pass
+
+
+cdef class Table(_Tabular):
     cdef:
         shared_ptr[CTable] sp_table
         CTable* table
@@ -482,7 +486,7 @@ cdef class Table(_PandasConvertible):
     cdef void init(self, const shared_ptr[CTable]& table)
 
 
-cdef class RecordBatch(_PandasConvertible):
+cdef class RecordBatch(_Tabular):
     cdef:
         shared_ptr[CRecordBatch] sp_batch
         CRecordBatch* batch
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index e8baa79250..893c0c2f3d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1450,8 +1450,129 @@ cdef _sanitize_arrays(arrays, names, schema, metadata,
             converted_arrays.append(item)
     return converted_arrays
 
+cdef class _Tabular(_PandasConvertible):
+    """Internal: An interface for common operations on tabular objects."""
 
-cdef class RecordBatch(_PandasConvertible):
+    def __init__(self):
+        raise TypeError("This object is not instantiable, "
+                        "use a subclass instead.")
+
+    def __repr__(self):
+        if not self._is_initialized():
+            raise ValueError("This object's internal pointer is NULL, do not "
+                             "use any methods or attributes on this object")
+        return self.to_string(preview_cols=10)
+
+    def _is_initialized(self):
+        raise NotImplementedError
+
+    def drop_null(self):
+        """
+        Remove rows that contain missing values from a Table or RecordBatch.
+
+        See :func:`pyarrow.compute.drop_null` for full usage.
+
+        Returns
+        -------
+        Table or RecordBatch
+            A tabular object with the same schema, with rows containing
+            no missing values.
+
+        Examples
+        --------
+        Table (works similarly for RecordBatch)
+
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({'year': [None, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 4, 5, 100],
+        ...                   'animals': ["Flamingo", "Horse", None, "Centipede"]})
+        >>> table = pa.Table.from_pandas(df)
+        >>> table.drop_null()
+        pyarrow.Table
+        year: double
+        n_legs: int64
+        animals: string
+        ----
+        year: [[2022,2021]]
+        n_legs: [[4,100]]
+        animals: [["Horse","Centipede"]]
+        """
+        return _pc().drop_null(self)
+
+    def take(self, object indices):
+        """
+        Select rows from a Table or RecordBatch.
+
+        See :func:`pyarrow.compute.take` for full usage.
+
+        Parameters
+        ----------
+        indices : Array or array-like
+            The indices in the tabular object whose rows will be returned.
+
+        Returns
+        -------
+        Table or RecordBatch
+            A tabular object with the same schema, containing the taken rows.
+
+        Examples
+        --------
+        Table (works similarly for RecordBatch)
+
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
+        ...                    'n_legs': [2, 4, 5, 100],
+        ...                    'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
+        >>> table = pa.Table.from_pandas(df)
+        >>> table.take([1,3])
+        pyarrow.Table
+        year: int64
+        n_legs: int64
+        animals: string
+        ----
+        year: [[2022,2021]]
+        n_legs: [[4,100]]
+        animals: [["Horse","Centipede"]]
+        """
+        return _pc().take(self, indices)
+
+    def to_string(self, *, show_metadata=False, preview_cols=0):
+        """
+        Return human-readable string representation of Table or RecordBatch.
+
+        Parameters
+        ----------
+        show_metadata : bool, default False
+            Display Field-level and Schema-level KeyValueMetadata.
+        preview_cols : int, default 0
+            Display values of the columns for the first N columns.
+
+        Returns
+        -------
+        str
+        """
+        # Use less verbose schema output.
+        schema_as_string = self.schema.to_string(
+            show_field_metadata=show_metadata,
+            show_schema_metadata=show_metadata
+        )
+        title = 'pyarrow.{}\n{}'.format(type(self).__name__, schema_as_string)
+        pieces = [title]
+        if preview_cols:
+            pieces.append('----')
+            for i in range(min(self.num_columns, preview_cols)):
+                pieces.append('{}: {}'.format(
+                    self.field(i).name,
+                    self.column(i).to_string(indent=0, skip_new_lines=True)
+                ))
+            if preview_cols < self.num_columns:
+                pieces.append('...')
+        return '\n'.join(pieces)
+
+
+cdef class RecordBatch(_Tabular):
     """
     Batch of rows of columns of equal length
 
@@ -1545,6 +1666,9 @@ cdef class RecordBatch(_PandasConvertible):
         self.sp_batch = batch
         self.batch = batch.get()
 
+    def _is_initialized(self):
+        return self.batch != NULL
+
     # ----------------------------------------------------------------------
     def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
         """
@@ -1696,15 +1820,10 @@ cdef class RecordBatch(_PandasConvertible):
         except TypeError:
             return NotImplemented
 
-    def to_string(self, show_metadata=False):
-        # Use less verbose schema output.
-        schema_as_string = self.schema.to_string(
-            show_field_metadata=show_metadata,
-            show_schema_metadata=show_metadata
-        )
-        return 'pyarrow.{}\n{}'.format(type(self).__name__, schema_as_string)
-
     def __repr__(self):
+        # TODO remove this and update pytests/doctests for
+        # RecordBatch.to_string(preview_cols=10) usage in
+        # parent class
         return self.to_string()
 
     def validate(self, *, full=False):
@@ -2254,67 +2373,6 @@ cdef class RecordBatch(_PandasConvertible):
 
         return result
 
-    def take(self, object indices):
-        """
-        Select rows from the record batch.
-
-        See :func:`pyarrow.compute.take` for full usage.
-
-        Parameters
-        ----------
-        indices : Array or array-like
-            The indices in the record batch whose rows will be returned.
-
-        Returns
-        -------
-        taken : RecordBatch
-            A record batch with the same schema, containing the taken rows.
-
-        Examples
-        --------
-        >>> import pyarrow as pa
-        >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
-        >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"])
-        >>> batch = pa.RecordBatch.from_arrays([n_legs, animals],
-        ...                                     names=["n_legs", "animals"])
-        >>> batch.take([1,3,4]).to_pandas()
-           n_legs        animals
-        0       2         Parrot
-        1       4          Horse
-        2       5  Brittle stars
-        """
-        return _pc().take(self, indices)
-
-    def drop_null(self):
-        """
-        Remove missing values from a RecordBatch.
-        See :func:`pyarrow.compute.drop_null` for full usage.
-
-        Examples
-        --------
-        >>> import pyarrow as pa
-        >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
-        >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", None, "Centipede"])
-        >>> batch = pa.RecordBatch.from_arrays([n_legs, animals],
-        ...                                     names=["n_legs", "animals"])
-        >>> batch.to_pandas()
-           n_legs    animals
-        0       2   Flamingo
-        1       2     Parrot
-        2       4        Dog
-        3       4      Horse
-        4       5       None
-        5     100  Centipede
-        >>> batch.drop_null().to_pandas()
-           n_legs    animals
-        0       2   Flamingo
-        1       2     Parrot
-        2       4        Dog
-        3       4      Horse
-        4     100  Centipede
-        """
-        return _pc().drop_null(self)
-
     def select(self, object columns):
         """
         Select columns of the RecordBatch.
@@ -2776,7 +2834,7 @@ def table_to_blocks(options, Table table, categories, extension_columns):
     return PyObject_to_object(result_obj)
 
 
-cdef class Table(_PandasConvertible):
+cdef class Table(_Tabular):
     """
     A collection of top-level named, equal length Arrow arrays.
 
@@ -2895,49 +2953,13 @@ cdef class Table(_PandasConvertible):
         raise TypeError("Do not call Table's constructor directly, use one of "
                         "the `Table.from_*` functions instead.")
 
-    def to_string(self, *, show_metadata=False, preview_cols=0):
-        """
-        Return human-readable string representation of Table.
-
-        Parameters
-        ----------
-        show_metadata : bool, default False
-            Display Field-level and Schema-level KeyValueMetadata.
-        preview_cols : int, default 0
-            Display values of the columns for the first N columns.
-
-        Returns
-        -------
-        str
-        """
-        # Use less verbose schema output.
-        schema_as_string = self.schema.to_string(
-            show_field_metadata=show_metadata,
-            show_schema_metadata=show_metadata
-        )
-        title = 'pyarrow.{}\n{}'.format(type(self).__name__, schema_as_string)
-        pieces = [title]
-        if preview_cols:
-            pieces.append('----')
-            for i in range(min(self.num_columns, preview_cols)):
-                pieces.append('{}: {}'.format(
-                    self.field(i).name,
-                    self.column(i).to_string(indent=0, skip_new_lines=True)
-                ))
-            if preview_cols < self.num_columns:
-                pieces.append('...')
-        return '\n'.join(pieces)
-
-    def __repr__(self):
-        if self.table == NULL:
-            raise ValueError("Table's internal pointer is NULL, do not use "
-                             "any methods or attributes on this object")
-        return self.to_string(preview_cols=10)
-
     cdef void init(self, const shared_ptr[CTable]& table):
         self.sp_table = table
         self.table = table.get()
 
+    def _is_initialized(self):
+        return self.table != NULL
+
     def validate(self, *, full=False):
         """
         Perform validation checks.  An exception is raised if validation fails.
@@ -3153,67 +3175,6 @@ cdef class Table(_PandasConvertible):
         else:
             return _pc().filter(self, mask, null_selection_behavior)
 
-    def take(self, object indices):
-        """
-        Select rows from the table.
-
-        See :func:`pyarrow.compute.take` for full usage.
-
-        Parameters
-        ----------
-        indices : Array or array-like
-            The indices in the table whose rows will be returned.
-
-        Returns
-        -------
-        taken : Table
-            A table with the same schema, containing the taken rows.
-
-        Examples
-        --------
-        >>> import pyarrow as pa
-        >>> import pandas as pd
-        >>> df = pd.DataFrame({'year': [2020, 2022, 2019, 2021],
-        ...                    'n_legs': [2, 4, 5, 100],
-        ...                    'animals': ["Flamingo", "Horse", "Brittle stars", "Centipede"]})
-        >>> table = pa.Table.from_pandas(df)
-        >>> table.take([1,3])
-        pyarrow.Table
-        year: int64
-        n_legs: int64
-        animals: string
-        ----
-        year: [[2022,2021]]
-        n_legs: [[4,100]]
-        animals: [["Horse","Centipede"]]
-        """
-        return _pc().take(self, indices)
-
-    def drop_null(self):
-        """
-        Remove missing values from a Table.
-        See :func:`pyarrow.compute.drop_null` for full usage.
-
-        Examples
-        --------
-        >>> import pyarrow as pa
-        >>> import pandas as pd
-        >>> df = pd.DataFrame({'year': [None, 2022, 2019, 2021],
-        ...                   'n_legs': [2, 4, 5, 100],
-        ...                   'animals': ["Flamingo", "Horse", None, "Centipede"]})
-        >>> table = pa.Table.from_pandas(df)
-        >>> table.drop_null()
-        pyarrow.Table
-        year: double
-        n_legs: int64
-        animals: string
-        ----
-        year: [[2022,2021]]
-        n_legs: [[4,100]]
-        animals: [["Horse","Centipede"]]
-        """
-        return _pc().drop_null(self)
-
     def select(self, object columns):
         """
         Select columns of the Table.