You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/19 18:20:30 UTC
[GitHub] [arrow] lidavidm commented on a diff in pull request #12901: ARROW-16114: [Docs][Python] Document Parquet FileMetaData
lidavidm commented on code in PR #12901:
URL: https://github.com/apache/arrow/pull/12901#discussion_r853360716
##########
python/pyarrow/_parquet.pyx:
##########
@@ -49,6 +49,8 @@ cimport cpython as cp
cdef class Statistics(_Weakrefable):
+ """Statistics for a single column in a single row group"""
Review Comment:
```suggestion
"""Statistics for a single column in a single row group."""
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -96,72 +98,87 @@ cdef class Statistics(_Weakrefable):
@property
def has_min_max(self):
+ """bool: whether min and max are present"""
return self.statistics.get().HasMinMax()
@property
def has_null_count(self):
+ """bool: whether null count is present"""
return self.statistics.get().HasNullCount()
@property
def has_distinct_count(self):
+ """bool: whether distinct count is present"""
return self.statistics.get().HasDistinctCount()
@property
def min_raw(self):
+ """bool, int, float, or bytes: min value as physical type"""
if self.has_min_max:
return _cast_statistic_raw_min(self.statistics.get())
else:
return None
@property
def max_raw(self):
+ """bool, int, float, or bytes: max value as physical type"""
if self.has_min_max:
return _cast_statistic_raw_max(self.statistics.get())
else:
return None
@property
def min(self):
+ """min value as logical type"""
if self.has_min_max:
return _cast_statistic_min(self.statistics.get())
else:
return None
@property
def max(self):
+ """max value as logical type"""
if self.has_min_max:
return _cast_statistic_max(self.statistics.get())
else:
return None
@property
def null_count(self):
+ """int: number of null values in chunk"""
return self.statistics.get().null_count()
@property
def distinct_count(self):
+ """int: distinct number of values in chunk"""
+ # TODO: Why is this always zero? ARROW-11793
return self.statistics.get().distinct_count()
@property
def num_values(self):
+ """int: number of non-null values"""
return self.statistics.get().num_values()
@property
def physical_type(self):
+ """str: physical type of column"""
raw_physical_type = self.statistics.get().physical_type()
return physical_type_name_from_enum(raw_physical_type)
@property
def logical_type(self):
+ """:class:`ParquetLogicalType`: logical type of column"""
return wrap_logical_type(self.statistics.get().descr().logical_type())
@property
def converted_type(self):
+ """str or None: legacy converted type"""
raw_converted_type = self.statistics.get().descr().converted_type()
return converted_type_name_from_enum(raw_converted_type)
cdef class ParquetLogicalType(_Weakrefable):
+ """Logical type of parquet type"""
Review Comment:
```suggestion
"""Logical type of Parquet type."""
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -392,45 +418,62 @@ cdef class ColumnChunkMetaData(_Weakrefable):
@property
def compression(self):
+ """str: type of compression used for column.
+
+ One of 'UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD',
+ or 'UNKNOWN'."""
return compression_name_from_enum(self.metadata.compression())
@property
def encodings(self):
+ """tuple of str: encodings used for column
+
+ One of 'PLAIN', 'BIT_PACKED', 'RLE', 'BYTE_STREAM_SPLIT', 'DELTA_BINARY_PACKED',
+ 'DELTA_BYTE_ARRAY'."""
return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
@property
def has_dictionary_page(self):
+ """bool: whether there is dictionary data present in the column chunk"""
return bool(self.metadata.has_dictionary_page())
@property
def dictionary_page_offset(self):
+ """int: offset of dictionary page relative to column chunk offset"""
if self.has_dictionary_page:
return self.metadata.dictionary_page_offset()
else:
return None
@property
def data_page_offset(self):
+ """int: offset of data page relative to column chunk offset"""
return self.metadata.data_page_offset()
@property
def has_index_page(self):
+ """Not yet supported"""
raise NotImplementedError('not supported in parquet-cpp')
@property
def index_page_offset(self):
+ """Not yet supported"""
raise NotImplementedError("parquet-cpp doesn't return valid values")
@property
def total_compressed_size(self):
+ """int: compressed size in bytes"""
return self.metadata.total_compressed_size()
@property
def total_uncompressed_size(self):
+ """int: uncompressed size in bytes"""
return self.metadata.total_uncompressed_size()
cdef class RowGroupMetaData(_Weakrefable):
+ """Metadata for a single row group"""
Review Comment:
```suggestion
"""Metadata for a single row group."""
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -732,13 +809,25 @@ cdef class ParquetSchema(_Weakrefable):
return self.schema.Equals(deref(other.schema))
def column(self, i):
+ """Return the schema for a single column
+
+ Parameters
+ ----------
+ i : int
+ index of column in schema
+
+ Returns
+ -------
+ column_schema : ColumnSchema
+ """
if i < 0 or i >= len(self):
raise IndexError('{0} out of bounds'.format(i))
return ColumnSchema(self, i)
cdef class ColumnSchema(_Weakrefable):
+ """Schema for a single column"""
Review Comment:
```suggestion
"""Schema for a single column."""
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -96,72 +98,87 @@ cdef class Statistics(_Weakrefable):
@property
def has_min_max(self):
+ """bool: whether min and max are present"""
Review Comment:
Huh, did not realize numpydoc recommended this style for property docstrings. It looks like the sentences should still end in a period, though?
##########
python/pyarrow/_parquet.pyx:
##########
@@ -609,6 +672,17 @@ cdef class FileMetaData(_Weakrefable):
return None
def row_group(self, int i):
+ """Get metadata for row group at index i
Review Comment:
```suggestion
"""Get metadata for row group at index i.
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -670,6 +744,8 @@ cdef class FileMetaData(_Weakrefable):
cdef class ParquetSchema(_Weakrefable):
+ """A Parquet schema"""
Review Comment:
```suggestion
"""A Parquet schema."""
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -481,14 +524,17 @@ cdef class RowGroupMetaData(_Weakrefable):
Review Comment:
There are some other methods above that aren't documented, should we document those?
##########
python/pyarrow/_parquet.pyx:
##########
@@ -504,6 +550,8 @@ def _reconstruct_filemetadata(Buffer serialized):
cdef class FileMetaData(_Weakrefable):
+ """Parquet metadata for a single file"""
Review Comment:
```suggestion
"""Parquet metadata for a single file."""
```
##########
python/pyarrow/_parquet.pyx:
##########
@@ -292,6 +309,8 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
cdef class ColumnChunkMetaData(_Weakrefable):
+ """Column metadata for a single row group"""
Review Comment:
```suggestion
"""Column metadata for a single row group."""
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org