You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2022/04/21 12:40:39 UTC
[arrow] branch master updated: ARROW-16114: [Docs][Python] Document Parquet FileMetaData
This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2e7acabf7b ARROW-16114: [Docs][Python] Document Parquet FileMetaData
2e7acabf7b is described below
commit 2e7acabf7ba9c4df8621918e82fe795a227a3b31
Author: Will Jones <wi...@gmail.com>
AuthorDate: Thu Apr 21 08:40:23 2022 -0400
ARROW-16114: [Docs][Python] Document Parquet FileMetaData
Recently was using this class in a third-party library and had a hard time finding information about these classes.
Closes #12901 from wjones127/ARROW-16114-document-parquet-metadata
Authored-by: Will Jones <wi...@gmail.com>
Signed-off-by: David Li <li...@gmail.com>
---
docs/source/python/api/formats.rst | 14 +++
python/pyarrow/_parquet.pyx | 237 ++++++++++++++++++++++++++++++++++++-
python/pyarrow/parquet/__init__.py | 5 +-
python/pyarrow/table.pxi | 1 -
4 files changed, 248 insertions(+), 9 deletions(-)
diff --git a/docs/source/python/api/formats.rst b/docs/source/python/api/formats.rst
index 55f6d568d9..9ca499c097 100644
--- a/docs/source/python/api/formats.rst
+++ b/docs/source/python/api/formats.rst
@@ -89,6 +89,20 @@ Parquet Files
write_table
write_to_dataset
+Parquet Metadata
+~~~~~~~~~~~~~~~~
+
+.. autosummary::
+ :toctree: ../generated/
+
+ FileMetaData
+ RowGroupMetaData
+ ColumnChunkMetaData
+ Statistics
+ ParquetSchema
+ ColumnSchema
+ ParquetLogicalType
+
Encrypted Parquet Files
~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 1e53b99d05..c634ea9669 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -50,6 +50,8 @@ cimport cpython as cp
cdef class Statistics(_Weakrefable):
+ """Statistics for a single column in a single row group."""
+
def __cinit__(self):
pass
@@ -75,6 +77,14 @@ cdef class Statistics(_Weakrefable):
self.converted_type)
def to_dict(self):
+ """
+ Get dictionary represenation of statistics.
+
+ Returns
+ -------
+ dict
+ Dictionary with a key for each attribute of this class.
+ """
d = dict(
has_min_max=self.has_min_max,
min=self.min,
@@ -93,22 +103,38 @@ cdef class Statistics(_Weakrefable):
return NotImplemented
def equals(self, Statistics other):
+ """
+ Return whether the two column statistics objects are equal.
+
+ Parameters
+ ----------
+ other : Statistics
+ Statistics to compare against.
+
+ Returns
+ -------
+ are_equal : bool
+ """
return self.statistics.get().Equals(deref(other.statistics.get()))
@property
def has_min_max(self):
+ """Whether min and max are present (bool)."""
return self.statistics.get().HasMinMax()
@property
def has_null_count(self):
+ """Whether null count is present (bool)."""
return self.statistics.get().HasNullCount()
@property
def has_distinct_count(self):
+ """Whether distinct count is preset (bool)."""
return self.statistics.get().HasDistinctCount()
@property
def min_raw(self):
+ """Min value as physical type (bool, int, float, or bytes)."""
if self.has_min_max:
return _cast_statistic_raw_min(self.statistics.get())
else:
@@ -116,6 +142,7 @@ cdef class Statistics(_Weakrefable):
@property
def max_raw(self):
+ """Max value as physical type (bool, int, float, or bytes)."""
if self.has_min_max:
return _cast_statistic_raw_max(self.statistics.get())
else:
@@ -123,6 +150,12 @@ cdef class Statistics(_Weakrefable):
@property
def min(self):
+ """
+ Min value as logical type.
+
+ Returned as the Python equivalent of logical type, such as datetime.date
+ for dates and decimal.Decimal for decimals.
+ """
if self.has_min_max:
min_scalar, _ = _cast_statistics(self.statistics.get())
return min_scalar.as_py()
@@ -131,6 +164,12 @@ cdef class Statistics(_Weakrefable):
@property
def max(self):
+ """
+ Max value as logical type.
+
+ Returned as the Python equivalent of logical type, such as datetime.date
+ for dates and decimal.Decimal for decimals.
+ """
if self.has_min_max:
_, max_scalar = _cast_statistics(self.statistics.get())
return max_scalar.as_py()
@@ -139,32 +178,44 @@ cdef class Statistics(_Weakrefable):
@property
def null_count(self):
+ """Number of null values in chunk (int)."""
return self.statistics.get().null_count()
@property
def distinct_count(self):
+ """
+ Distinct number of values in chunk (int).
+
+ If this is not set, will return 0.
+ """
+ # This seems to be zero if not set. See: ARROW-11793
return self.statistics.get().distinct_count()
@property
def num_values(self):
+ """Number of non-null values (int)."""
return self.statistics.get().num_values()
@property
def physical_type(self):
+ """Physical type of column (str)."""
raw_physical_type = self.statistics.get().physical_type()
return physical_type_name_from_enum(raw_physical_type)
@property
def logical_type(self):
+ """Logical type of column (:class:`ParquetLogicalType`)."""
return wrap_logical_type(self.statistics.get().descr().logical_type())
@property
def converted_type(self):
+ """Legacy converted type (str or None)."""
raw_converted_type = self.statistics.get().descr().converted_type()
return converted_type_name_from_enum(raw_converted_type)
cdef class ParquetLogicalType(_Weakrefable):
+ """Logical type of parquet type."""
cdef:
shared_ptr[const CParquetLogicalType] type
@@ -174,14 +225,29 @@ cdef class ParquetLogicalType(_Weakrefable):
cdef init(self, const shared_ptr[const CParquetLogicalType]& type):
self.type = type
+ def __repr__(self):
+ return "{}\n {}".format(object.__repr__(self), str(self))
+
def __str__(self):
return frombytes(self.type.get().ToString(), safe=True)
def to_json(self):
+ """
+ Get a JSON string containing type and type parameters.
+
+ Returns
+ -------
+ json : str
+ JSON representation of type, with at least a field called 'Type'
+ which contains the type name. If the type is parameterized, such
+ as a decimal with scale and precision, will contain those as fields
+ as well.
+ """
return frombytes(self.type.get().ToJSON())
@property
def type(self):
+ """Name of the logical type (str)."""
return logical_type_name_from_enum(self.type.get().type())
@@ -246,6 +312,8 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
cdef class ColumnChunkMetaData(_Weakrefable):
+ """Column metadata for a single row group."""
+
def __cinit__(self):
pass
@@ -283,6 +351,14 @@ cdef class ColumnChunkMetaData(_Weakrefable):
self.total_uncompressed_size)
def to_dict(self):
+ """
+ Get dictionary represenation of the column chunk metadata.
+
+ Returns
+ -------
+ dict
+ Dictionary with a key for each attribute of this class.
+ """
statistics = self.statistics.to_dict() if self.is_stats_set else None
d = dict(
file_offset=self.file_offset,
@@ -309,35 +385,54 @@ cdef class ColumnChunkMetaData(_Weakrefable):
return NotImplemented
def equals(self, ColumnChunkMetaData other):
+ """
+ Return whether the two column chunk metadata objects are equal.
+
+ Parameters
+ ----------
+ other : ColumnChunkMetaData
+ Metadata to compare against.
+
+ Returns
+ -------
+ are_equal : bool
+ """
return self.metadata.Equals(deref(other.metadata))
@property
def file_offset(self):
+ """Offset into file where column chunk is located (int)."""
return self.metadata.file_offset()
@property
def file_path(self):
+ """Optional file path if set (str or None)."""
return frombytes(self.metadata.file_path())
@property
def physical_type(self):
+ """Physical type of column (str)."""
return physical_type_name_from_enum(self.metadata.type())
@property
def num_values(self):
+ """Total number of values (int)."""
return self.metadata.num_values()
@property
def path_in_schema(self):
+ """Nested path to field, separated by periods (str)."""
path = self.metadata.path_in_schema().get().ToDotString()
return frombytes(path)
@property
def is_stats_set(self):
+ """Whether or not statistics are present in metadata (bool)."""
return self.metadata.is_stats_set()
@property
def statistics(self):
+ """Statistics for column chunk (:class:`Statistics`)."""
if not self.metadata.is_stats_set():
return None
statistics = Statistics()
@@ -346,18 +441,32 @@ cdef class ColumnChunkMetaData(_Weakrefable):
@property
def compression(self):
+ """
+ Type of compression used for column (str).
+
+ One of 'UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD',
+ or 'UNKNOWN'.
+ """
return compression_name_from_enum(self.metadata.compression())
@property
def encodings(self):
+ """
+ Encodings used for column (tuple of str).
+
+ One of 'PLAIN', 'BIT_PACKED', 'RLE', 'BYTE_STREAM_SPLIT', 'DELTA_BINARY_PACKED',
+ 'DELTA_BYTE_ARRAY'.
+ """
return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
@property
def has_dictionary_page(self):
+ """Whether there is dictionary data present in the column chunk (bool)."""
return bool(self.metadata.has_dictionary_page())
@property
def dictionary_page_offset(self):
+ """Offset of dictionary page reglative to column chunk offset (int)."""
if self.has_dictionary_page:
return self.metadata.dictionary_page_offset()
else:
@@ -365,26 +474,33 @@ cdef class ColumnChunkMetaData(_Weakrefable):
@property
def data_page_offset(self):
+ """Offset of data page reglative to column chunk offset (int)."""
return self.metadata.data_page_offset()
@property
def has_index_page(self):
+ """Not yet supported."""
raise NotImplementedError('not supported in parquet-cpp')
@property
def index_page_offset(self):
+ """Not yet supported."""
raise NotImplementedError("parquet-cpp doesn't return valid values")
@property
def total_compressed_size(self):
+ """Compresssed size in bytes (int)."""
return self.metadata.total_compressed_size()
@property
def total_uncompressed_size(self):
+ """Uncompressed size in bytes (int)."""
return self.metadata.total_uncompressed_size()
cdef class RowGroupMetaData(_Weakrefable):
+ """Metadata for a single row group."""
+
def __cinit__(self, FileMetaData parent, int index):
if index < 0 or index >= parent.num_row_groups:
raise IndexError('{0} out of bounds'.format(index))
@@ -403,9 +519,34 @@ cdef class RowGroupMetaData(_Weakrefable):
return NotImplemented
def equals(self, RowGroupMetaData other):
+ """
+ Return whether the two row group metadata objects are equal.
+
+ Parameters
+ ----------
+ other : RowGroupMetaData
+ Metadata to compare against.
+
+ Returns
+ -------
+ are_equal : bool
+ """
return self.metadata.Equals(deref(other.metadata))
def column(self, int i):
+ """
+ Get column metadata at given index.
+
+ Parameters
+ ----------
+ i : int
+ Index of column to get metadata for.
+
+ Returns
+ -------
+ ColumnChunkMetaData
+ Metadata for column within this chunk.
+ """
if i < 0 or i >= self.num_columns:
raise IndexError('{0} out of bounds'.format(i))
chunk = ColumnChunkMetaData()
@@ -422,6 +563,14 @@ cdef class RowGroupMetaData(_Weakrefable):
self.total_byte_size)
def to_dict(self):
+ """
+ Get dictionary represenation of the row group metadata.
+
+ Returns
+ -------
+ dict
+ Dictionary with a key for each attribute of this class.
+ """
columns = []
d = dict(
num_columns=self.num_columns,
@@ -435,14 +584,17 @@ cdef class RowGroupMetaData(_Weakrefable):
@property
def num_columns(self):
+ """Number of columns in this row group (int)."""
return self.metadata.num_columns()
@property
def num_rows(self):
+ """Number of rows in this row group (int)."""
return self.metadata.num_rows()
@property
def total_byte_size(self):
+ """Total byte size of all the uncompressed column data in this row group (int)."""
return self.metadata.total_byte_size()
@@ -458,6 +610,8 @@ def _reconstruct_filemetadata(Buffer serialized):
cdef class FileMetaData(_Weakrefable):
+ """Parquet metadata for a single file."""
+
def __cinit__(self):
pass
@@ -485,6 +639,14 @@ cdef class FileMetaData(_Weakrefable):
self.serialized_size)
def to_dict(self):
+ """
+ Get dictionary represenation of the file metadata.
+
+ Returns
+ -------
+ dict
+ Dictionary with a key for each attribute of this class.
+ """
row_groups = []
d = dict(
created_by=self.created_by,
@@ -506,32 +668,54 @@ cdef class FileMetaData(_Weakrefable):
return NotImplemented
def equals(self, FileMetaData other):
+ """
+ Return whether the two file metadata objects are equal.
+
+ Parameters
+ ----------
+ other : FileMetaData
+ Metadata to compare against.
+
+ Returns
+ -------
+ are_equal : bool
+ """
return self._metadata.Equals(deref(other._metadata))
@property
def schema(self):
+ """Schema of the file (:class:`ParquetSchema`)."""
if self._schema is None:
self._schema = ParquetSchema(self)
return self._schema
@property
def serialized_size(self):
+ """Size of the original thrift encoded metadata footer (int)."""
return self._metadata.size()
@property
def num_columns(self):
+ """Number of columns in file (int)."""
return self._metadata.num_columns()
@property
def num_rows(self):
+ """Total number of rows in file (int)."""
return self._metadata.num_rows()
@property
def num_row_groups(self):
+ """Number of row groups in file (int)."""
return self._metadata.num_row_groups()
@property
def format_version(self):
+ """
+ Parquet format version used in file (str, such as '1.0', '2.4').
+
+ If version is missing or unparsable, will default to assuming '1.0'.
+ """
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
return '1.0'
@@ -548,10 +732,17 @@ cdef class FileMetaData(_Weakrefable):
@property
def created_by(self):
+ """
+ String describing source of the parquet file (str).
+
+ This typically includes library name and version number. For example, Arrow 7.0's
+ writer returns 'parquet-cpp-arrow version 7.0.0'.
+ """
return frombytes(self._metadata.created_by())
@property
def metadata(self):
+ """Additional metadata as key value pairs (dict[bytes, bytes])."""
cdef:
unordered_map[c_string, c_string] metadata
const CKeyValueMetadata* underlying_metadata
@@ -563,6 +754,18 @@ cdef class FileMetaData(_Weakrefable):
return None
def row_group(self, int i):
+ """
+ Get metadata for row group at index i.
+
+ Parameters
+ ----------
+ i : int
+ Row group index to get.
+
+ Returns
+ -------
+ row_group_metadata : RowGroupMetaData
+ """
return RowGroupMetaData(self, i)
def set_file_path(self, path):
@@ -624,6 +827,8 @@ cdef class FileMetaData(_Weakrefable):
cdef class ParquetSchema(_Weakrefable):
+ """A Parquet schema."""
+
def __cinit__(self, FileMetaData container):
self.parent = container
self.schema = container._metadata.schema()
@@ -644,15 +849,16 @@ cdef class ParquetSchema(_Weakrefable):
@property
def names(self):
+ """Name of each field (list of str)."""
return [self[i].name for i in range(len(self))]
def to_arrow_schema(self):
"""
- Convert Parquet schema to effective Arrow schema
+ Convert Parquet schema to effective Arrow schema.
Returns
-------
- schema : pyarrow.Schema
+ schema : Schema
"""
cdef shared_ptr[CSchema] sp_arrow_schema
@@ -686,6 +892,18 @@ cdef class ParquetSchema(_Weakrefable):
return self.schema.Equals(deref(other.schema))
def column(self, i):
+ """
+ Return the schema for a single column.
+
+ Parameters
+ ----------
+ i : int
+ Index of column in schema.
+
+ Returns
+ -------
+ column_schema : ColumnSchema
+ """
if i < 0 or i >= len(self):
raise IndexError('{0} out of bounds'.format(i))
@@ -693,6 +911,7 @@ cdef class ParquetSchema(_Weakrefable):
cdef class ColumnSchema(_Weakrefable):
+ """Schema for a single column."""
cdef:
int index
ParquetSchema parent
@@ -753,48 +972,54 @@ cdef class ColumnSchema(_Weakrefable):
@property
def name(self):
+ """Name of field (str)."""
return frombytes(self.descr.name())
@property
def path(self):
+ """Nested path to field, separated by periods (str)."""
return frombytes(self.descr.path().get().ToDotString())
@property
def max_definition_level(self):
+ """Maximum definition level (int)."""
return self.descr.max_definition_level()
@property
def max_repetition_level(self):
+ """Maximum repetition level (int)."""
return self.descr.max_repetition_level()
@property
def physical_type(self):
+ """Name of physical type (str)."""
return physical_type_name_from_enum(self.descr.physical_type())
@property
def logical_type(self):
+ """Logical type of column (:class:`ParquetLogicalType`)."""
return wrap_logical_type(self.descr.logical_type())
@property
def converted_type(self):
+ """Legacy converted type (str or None)."""
return converted_type_name_from_enum(self.descr.converted_type())
- @property
- def logical_type(self):
- return wrap_logical_type(self.descr.logical_type())
-
# FIXED_LEN_BYTE_ARRAY attribute
@property
def length(self):
+ """Array length if fixed length byte array type, None otherwise (int or None)."""
return self.descr.type_length()
# Decimal attributes
@property
def precision(self):
+ """Precision if decimal type, None otherwise (int or None)."""
return self.descr.type_precision()
@property
def scale(self):
+ """Scale if decimal type, None otherwise (int or None)."""
return self.descr.type_scale()
diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index f616b04e1d..4e6b85f22c 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -37,6 +37,7 @@ from pyarrow._parquet import (ParquetReader, Statistics, # noqa
FileMetaData, RowGroupMetaData,
ColumnChunkMetaData,
ParquetSchema, ColumnSchema,
+ ParquetLogicalType,
FileEncryptionProperties,
FileDecryptionProperties)
from pyarrow.fs import (LocalFileSystem, FileSystem,
@@ -3230,7 +3231,7 @@ def write_metadata(schema, where, metadata_collector=None, **kwargs):
def read_metadata(where, memory_map=False, decryption_properties=None):
"""
- Read FileMetadata from footer of a single Parquet file.
+ Read FileMetaData from footer of a single Parquet file.
Parameters
----------
@@ -3242,7 +3243,7 @@ def read_metadata(where, memory_map=False, decryption_properties=None):
Returns
-------
- metadata : FileMetadata
+ metadata : FileMetaData
Examples
--------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index b12016c7d5..17e787149a 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -4092,7 +4092,6 @@ cdef class Table(_PandasConvertible):
Yields
------
ChunkedArray
- ChunkedArray
Examples
--------