You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/04/21 12:40:39 UTC

[arrow] branch master updated: ARROW-16114: [Docs][Python] Document Parquet FileMetaData

This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2e7acabf7b ARROW-16114: [Docs][Python] Document Parquet FileMetaData
2e7acabf7b is described below

commit 2e7acabf7ba9c4df8621918e82fe795a227a3b31
Author: Will Jones <wi...@gmail.com>
AuthorDate: Thu Apr 21 08:40:23 2022 -0400

    ARROW-16114: [Docs][Python] Document Parquet FileMetaData
    
    Recently was using this class in a third-party library and had a hard time finding information about these classes.
    
    Closes #12901 from wjones127/ARROW-16114-document-parquet-metadata
    
    Authored-by: Will Jones <wi...@gmail.com>
    Signed-off-by: David Li <li...@gmail.com>
---
 docs/source/python/api/formats.rst |  14 +++
 python/pyarrow/_parquet.pyx        | 237 ++++++++++++++++++++++++++++++++++++-
 python/pyarrow/parquet/__init__.py |   5 +-
 python/pyarrow/table.pxi           |   1 -
 4 files changed, 248 insertions(+), 9 deletions(-)

diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst
index 55f6d568d9..9ca499c097 100644
--- a/docs/source/python/api/formats.rst
+++ b/docs/source/python/api/formats.rst
@@ -89,6 +89,20 @@ Parquet Files
    write_table
    write_to_dataset
 
+Parquet Metadata
+~~~~~~~~~~~~~~~~
+
+.. autosummary::
+   :toctree: ../generated/
+
+   FileMetaData
+   RowGroupMetaData
+   ColumnChunkMetaData
+   Statistics
+   ParquetSchema
+   ColumnSchema
+   ParquetLogicalType
+
 Encrypted Parquet Files
 ~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 1e53b99d05..c634ea9669 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -50,6 +50,8 @@ cimport cpython as cp
 
 
 cdef class Statistics(_Weakrefable):
+    """Statistics for a single column in a single row group."""
+
     def __cinit__(self):
         pass
 
@@ -75,6 +77,14 @@ cdef class Statistics(_Weakrefable):
                                         self.converted_type)
 
     def to_dict(self):
+        """
+        Get dictionary representation of statistics.
+
+        Returns
+        -------
+        dict
+            Dictionary with a key for each attribute of this class.
+        """
         d = dict(
             has_min_max=self.has_min_max,
             min=self.min,
@@ -93,22 +103,38 @@ cdef class Statistics(_Weakrefable):
             return NotImplemented
 
     def equals(self, Statistics other):
+        """
+        Return whether the two column statistics objects are equal.
+
+        Parameters
+        ----------
+        other : Statistics
+            Statistics to compare against.
+
+        Returns
+        -------
+        are_equal : bool
+        """
         return self.statistics.get().Equals(deref(other.statistics.get()))
 
     @property
     def has_min_max(self):
+        """Whether min and max are present (bool)."""
         return self.statistics.get().HasMinMax()
 
     @property
     def has_null_count(self):
+        """Whether null count is present (bool)."""
         return self.statistics.get().HasNullCount()
 
     @property
     def has_distinct_count(self):
+        """Whether distinct count is present (bool)."""
         return self.statistics.get().HasDistinctCount()
 
     @property
     def min_raw(self):
+        """Min value as physical type (bool, int, float, or bytes)."""
         if self.has_min_max:
             return _cast_statistic_raw_min(self.statistics.get())
         else:
@@ -116,6 +142,7 @@ cdef class Statistics(_Weakrefable):
 
     @property
     def max_raw(self):
+        """Max value as physical type (bool, int, float, or bytes)."""
         if self.has_min_max:
             return _cast_statistic_raw_max(self.statistics.get())
         else:
@@ -123,6 +150,12 @@ cdef class Statistics(_Weakrefable):
 
     @property
     def min(self):
+        """
+        Min value as logical type.
+
+        Returned as the Python equivalent of logical type, such as datetime.date
+        for dates and decimal.Decimal for decimals.
+        """
         if self.has_min_max:
             min_scalar, _ = _cast_statistics(self.statistics.get())
             return min_scalar.as_py()
@@ -131,6 +164,12 @@ cdef class Statistics(_Weakrefable):
 
     @property
     def max(self):
+        """
+        Max value as logical type.
+
+        Returned as the Python equivalent of logical type, such as datetime.date
+        for dates and decimal.Decimal for decimals.
+        """
         if self.has_min_max:
             _, max_scalar = _cast_statistics(self.statistics.get())
             return max_scalar.as_py()
@@ -139,32 +178,44 @@ cdef class Statistics(_Weakrefable):
 
     @property
     def null_count(self):
+        """Number of null values in chunk (int)."""
         return self.statistics.get().null_count()
 
     @property
     def distinct_count(self):
+        """
+        Distinct number of values in chunk (int).
+
+        If this is not set, will return 0.
+        """
+        # This seems to be zero if not set. See: ARROW-11793
         return self.statistics.get().distinct_count()
 
     @property
     def num_values(self):
+        """Number of non-null values (int)."""
         return self.statistics.get().num_values()
 
     @property
     def physical_type(self):
+        """Physical type of column (str)."""
         raw_physical_type = self.statistics.get().physical_type()
         return physical_type_name_from_enum(raw_physical_type)
 
     @property
     def logical_type(self):
+        """Logical type of column (:class:`ParquetLogicalType`)."""
         return wrap_logical_type(self.statistics.get().descr().logical_type())
 
     @property
     def converted_type(self):
+        """Legacy converted type (str or None)."""
         raw_converted_type = self.statistics.get().descr().converted_type()
         return converted_type_name_from_enum(raw_converted_type)
 
 
 cdef class ParquetLogicalType(_Weakrefable):
+    """Logical type of parquet type."""
     cdef:
         shared_ptr[const CParquetLogicalType] type
 
@@ -174,14 +225,29 @@ cdef class ParquetLogicalType(_Weakrefable):
     cdef init(self, const shared_ptr[const CParquetLogicalType]& type):
         self.type = type
 
+    def __repr__(self):
+        return "{}\n  {}".format(object.__repr__(self), str(self))
+
     def __str__(self):
         return frombytes(self.type.get().ToString(), safe=True)
 
     def to_json(self):
+        """
+        Get a JSON string containing type and type parameters.
+
+        Returns
+        -------
+        json : str
+            JSON representation of type, with at least a field called 'Type'
+            which contains the type name. If the type is parameterized, such
+            as a decimal with scale and precision, it will contain those as
+            fields as well.
+        """
         return frombytes(self.type.get().ToJSON())
 
     @property
     def type(self):
+        """Name of the logical type (str)."""
         return logical_type_name_from_enum(self.type.get().type())
 
 
@@ -246,6 +312,8 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
 
 
 cdef class ColumnChunkMetaData(_Weakrefable):
+    """Column metadata for a single row group."""
+
     def __cinit__(self):
         pass
 
@@ -283,6 +351,14 @@ cdef class ColumnChunkMetaData(_Weakrefable):
                                           self.total_uncompressed_size)
 
     def to_dict(self):
+        """
+        Get dictionary representation of the column chunk metadata.
+
+        Returns
+        -------
+        dict
+            Dictionary with a key for each attribute of this class.
+        """
         statistics = self.statistics.to_dict() if self.is_stats_set else None
         d = dict(
             file_offset=self.file_offset,
@@ -309,35 +385,54 @@ cdef class ColumnChunkMetaData(_Weakrefable):
             return NotImplemented
 
     def equals(self, ColumnChunkMetaData other):
+        """
+        Return whether the two column chunk metadata objects are equal.
+
+        Parameters
+        ----------
+        other : ColumnChunkMetaData
+            Metadata to compare against.
+
+        Returns
+        -------
+        are_equal : bool
+        """
         return self.metadata.Equals(deref(other.metadata))
 
     @property
     def file_offset(self):
+        """Offset into file where column chunk is located (int)."""
         return self.metadata.file_offset()
 
     @property
     def file_path(self):
+        """Optional file path if set (str or None)."""
         return frombytes(self.metadata.file_path())
 
     @property
     def physical_type(self):
+        """Physical type of column (str)."""
         return physical_type_name_from_enum(self.metadata.type())
 
     @property
     def num_values(self):
+        """Total number of values (int)."""
         return self.metadata.num_values()
 
     @property
     def path_in_schema(self):
+        """Nested path to field, separated by periods (str)."""
         path = self.metadata.path_in_schema().get().ToDotString()
         return frombytes(path)
 
     @property
     def is_stats_set(self):
+        """Whether or not statistics are present in metadata (bool)."""
         return self.metadata.is_stats_set()
 
     @property
     def statistics(self):
+        """Statistics for column chunk (:class:`Statistics`)."""
         if not self.metadata.is_stats_set():
             return None
         statistics = Statistics()
@@ -346,18 +441,32 @@ cdef class ColumnChunkMetaData(_Weakrefable):
 
     @property
     def compression(self):
+        """
+        Type of compression used for column (str).
+
+        One of 'UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD',
+        or 'UNKNOWN'.
+        """
         return compression_name_from_enum(self.metadata.compression())
 
     @property
     def encodings(self):
+        """
+        Encodings used for column (tuple of str).
+
+        Each is one of 'PLAIN', 'BIT_PACKED', 'RLE', 'BYTE_STREAM_SPLIT',
+        'DELTA_BINARY_PACKED', 'DELTA_BYTE_ARRAY'.
+        """
         return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
 
     @property
     def has_dictionary_page(self):
+        """Whether there is dictionary data present in the column chunk (bool)."""
         return bool(self.metadata.has_dictionary_page())
 
     @property
     def dictionary_page_offset(self):
+        """Offset of dictionary page relative to column chunk offset (int)."""
         if self.has_dictionary_page:
             return self.metadata.dictionary_page_offset()
         else:
@@ -365,26 +474,33 @@ cdef class ColumnChunkMetaData(_Weakrefable):
 
     @property
     def data_page_offset(self):
+        """Offset of data page relative to column chunk offset (int)."""
         return self.metadata.data_page_offset()
 
     @property
     def has_index_page(self):
+        """Not yet supported."""
         raise NotImplementedError('not supported in parquet-cpp')
 
     @property
     def index_page_offset(self):
+        """Not yet supported."""
         raise NotImplementedError("parquet-cpp doesn't return valid values")
 
     @property
     def total_compressed_size(self):
+        """Compressed size in bytes (int)."""
         return self.metadata.total_compressed_size()
 
     @property
     def total_uncompressed_size(self):
+        """Uncompressed size in bytes (int)."""
         return self.metadata.total_uncompressed_size()
 
 
 cdef class RowGroupMetaData(_Weakrefable):
+    """Metadata for a single row group."""
+
     def __cinit__(self, FileMetaData parent, int index):
         if index < 0 or index >= parent.num_row_groups:
             raise IndexError('{0} out of bounds'.format(index))
@@ -403,9 +519,34 @@ cdef class RowGroupMetaData(_Weakrefable):
             return NotImplemented
 
     def equals(self, RowGroupMetaData other):
+        """
+        Return whether the two row group metadata objects are equal.
+
+        Parameters
+        ----------
+        other : RowGroupMetaData
+            Metadata to compare against.
+
+        Returns
+        -------
+        are_equal : bool
+        """
         return self.metadata.Equals(deref(other.metadata))
 
     def column(self, int i):
+        """
+        Get column metadata at given index.
+
+        Parameters
+        ----------
+        i : int
+            Index of column to get metadata for.
+
+        Returns
+        -------
+        ColumnChunkMetaData
+            Metadata for column within this row group.
+        """
         if i < 0 or i >= self.num_columns:
             raise IndexError('{0} out of bounds'.format(i))
         chunk = ColumnChunkMetaData()
@@ -422,6 +563,14 @@ cdef class RowGroupMetaData(_Weakrefable):
                                  self.total_byte_size)
 
     def to_dict(self):
+        """
+        Get dictionary representation of the row group metadata.
+
+        Returns
+        -------
+        dict
+            Dictionary with a key for each attribute of this class.
+        """
         columns = []
         d = dict(
             num_columns=self.num_columns,
@@ -435,14 +584,17 @@ cdef class RowGroupMetaData(_Weakrefable):
 
     @property
     def num_columns(self):
+        """Number of columns in this row group (int)."""
         return self.metadata.num_columns()
 
     @property
     def num_rows(self):
+        """Number of rows in this row group (int)."""
         return self.metadata.num_rows()
 
     @property
     def total_byte_size(self):
+        """Total byte size of all the uncompressed column data in this row group (int)."""
         return self.metadata.total_byte_size()
 
 
@@ -458,6 +610,8 @@ def _reconstruct_filemetadata(Buffer serialized):
 
 
 cdef class FileMetaData(_Weakrefable):
+    """Parquet metadata for a single file."""
+
     def __cinit__(self):
         pass
 
@@ -485,6 +639,14 @@ cdef class FileMetaData(_Weakrefable):
                                  self.serialized_size)
 
     def to_dict(self):
+        """
+        Get dictionary representation of the file metadata.
+
+        Returns
+        -------
+        dict
+            Dictionary with a key for each attribute of this class.
+        """
         row_groups = []
         d = dict(
             created_by=self.created_by,
@@ -506,32 +668,54 @@ cdef class FileMetaData(_Weakrefable):
             return NotImplemented
 
     def equals(self, FileMetaData other):
+        """
+        Return whether the two file metadata objects are equal.
+
+        Parameters
+        ----------
+        other : FileMetaData
+            Metadata to compare against.
+
+        Returns
+        -------
+        are_equal : bool
+        """
         return self._metadata.Equals(deref(other._metadata))
 
     @property
     def schema(self):
+        """Schema of the file (:class:`ParquetSchema`)."""
         if self._schema is None:
             self._schema = ParquetSchema(self)
         return self._schema
 
     @property
     def serialized_size(self):
+        """Size of the original thrift encoded metadata footer (int)."""
         return self._metadata.size()
 
     @property
     def num_columns(self):
+        """Number of columns in file (int)."""
         return self._metadata.num_columns()
 
     @property
     def num_rows(self):
+        """Total number of rows in file (int)."""
         return self._metadata.num_rows()
 
     @property
     def num_row_groups(self):
+        """Number of row groups in file (int)."""
         return self._metadata.num_row_groups()
 
     @property
     def format_version(self):
+        """
+        Parquet format version used in file (str, such as '1.0', '2.4').
+
+        If version is missing or unparsable, will default to assuming '1.0'.
+        """
         cdef ParquetVersion version = self._metadata.version()
         if version == ParquetVersion_V1:
             return '1.0'
@@ -548,10 +732,17 @@ cdef class FileMetaData(_Weakrefable):
 
     @property
     def created_by(self):
+        """
+        String describing source of the parquet file (str).
+
+        This typically includes library name and version number. For example, Arrow 7.0's
+        writer returns 'parquet-cpp-arrow version 7.0.0'.
+        """
         return frombytes(self._metadata.created_by())
 
     @property
     def metadata(self):
+        """Additional metadata as key value pairs (dict[bytes, bytes])."""
         cdef:
             unordered_map[c_string, c_string] metadata
             const CKeyValueMetadata* underlying_metadata
@@ -563,6 +754,18 @@ cdef class FileMetaData(_Weakrefable):
             return None
 
     def row_group(self, int i):
+        """
+        Get metadata for row group at index i.
+
+        Parameters
+        ----------
+        i : int
+            Row group index to get.
+
+        Returns
+        -------
+        row_group_metadata : RowGroupMetaData
+        """
         return RowGroupMetaData(self, i)
 
     def set_file_path(self, path):
@@ -624,6 +827,8 @@ cdef class FileMetaData(_Weakrefable):
 
 
 cdef class ParquetSchema(_Weakrefable):
+    """A Parquet schema."""
+
     def __cinit__(self, FileMetaData container):
         self.parent = container
         self.schema = container._metadata.schema()
@@ -644,15 +849,16 @@ cdef class ParquetSchema(_Weakrefable):
 
     @property
     def names(self):
+        """Name of each field (list of str)."""
         return [self[i].name for i in range(len(self))]
 
     def to_arrow_schema(self):
         """
-        Convert Parquet schema to effective Arrow schema
+        Convert Parquet schema to effective Arrow schema.
 
         Returns
         -------
-        schema : pyarrow.Schema
+        schema : Schema
         """
         cdef shared_ptr[CSchema] sp_arrow_schema
 
@@ -686,6 +892,18 @@ cdef class ParquetSchema(_Weakrefable):
         return self.schema.Equals(deref(other.schema))
 
     def column(self, i):
+        """
+        Return the schema for a single column.
+
+        Parameters
+        ----------
+        i : int
+            Index of column in schema.
+
+        Returns
+        -------
+        column_schema : ColumnSchema
+        """
         if i < 0 or i >= len(self):
             raise IndexError('{0} out of bounds'.format(i))
 
@@ -693,6 +911,7 @@ cdef class ParquetSchema(_Weakrefable):
 
 
 cdef class ColumnSchema(_Weakrefable):
+    """Schema for a single column."""
     cdef:
         int index
         ParquetSchema parent
@@ -753,48 +972,54 @@ cdef class ColumnSchema(_Weakrefable):
 
     @property
     def name(self):
+        """Name of field (str)."""
         return frombytes(self.descr.name())
 
     @property
     def path(self):
+        """Nested path to field, separated by periods (str)."""
         return frombytes(self.descr.path().get().ToDotString())
 
     @property
     def max_definition_level(self):
+        """Maximum definition level (int)."""
         return self.descr.max_definition_level()
 
     @property
     def max_repetition_level(self):
+        """Maximum repetition level (int)."""
         return self.descr.max_repetition_level()
 
     @property
     def physical_type(self):
+        """Name of physical type (str)."""
         return physical_type_name_from_enum(self.descr.physical_type())
 
     @property
     def logical_type(self):
+        """Logical type of column (:class:`ParquetLogicalType`)."""
         return wrap_logical_type(self.descr.logical_type())
 
     @property
     def converted_type(self):
+        """Legacy converted type (str or None)."""
         return converted_type_name_from_enum(self.descr.converted_type())
 
-    @property
-    def logical_type(self):
-        return wrap_logical_type(self.descr.logical_type())
-
     # FIXED_LEN_BYTE_ARRAY attribute
     @property
     def length(self):
+        """Array length if fixed length byte array type, None otherwise (int or None)."""
         return self.descr.type_length()
 
     # Decimal attributes
     @property
     def precision(self):
+        """Precision if decimal type, None otherwise (int or None)."""
         return self.descr.type_precision()
 
     @property
     def scale(self):
+        """Scale if decimal type, None otherwise (int or None)."""
         return self.descr.type_scale()
 
 
diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index f616b04e1d..4e6b85f22c 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -37,6 +37,7 @@ from pyarrow._parquet import (ParquetReader, Statistics,  # noqa
                               FileMetaData, RowGroupMetaData,
                               ColumnChunkMetaData,
                               ParquetSchema, ColumnSchema,
+                              ParquetLogicalType,
                               FileEncryptionProperties,
                               FileDecryptionProperties)
 from pyarrow.fs import (LocalFileSystem, FileSystem,
@@ -3230,7 +3231,7 @@ def write_metadata(schema, where, metadata_collector=None, **kwargs):
 
 def read_metadata(where, memory_map=False, decryption_properties=None):
     """
-    Read FileMetadata from footer of a single Parquet file.
+    Read FileMetaData from footer of a single Parquet file.
 
     Parameters
     ----------
@@ -3242,7 +3243,7 @@ def read_metadata(where, memory_map=False, decryption_properties=None):
 
     Returns
     -------
-    metadata : FileMetadata
+    metadata : FileMetaData
 
     Examples
     --------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index b12016c7d5..17e787149a 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -4092,7 +4092,6 @@ cdef class Table(_PandasConvertible):
         Yields
         ------
         ChunkedArray
-        ChunkedArray
 
         Examples
         --------