You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/19 18:20:30 UTC

[GitHub] [arrow] lidavidm commented on a diff in pull request #12901: ARROW-16114: [Docs][Python] Document Parquet FileMetaData

lidavidm commented on code in PR #12901:
URL: https://github.com/apache/arrow/pull/12901#discussion_r853360716


##########
python/pyarrow/_parquet.pyx:
##########
@@ -49,6 +49,8 @@ cimport cpython as cp
 
 
 cdef class Statistics(_Weakrefable):
+    """Statistics for a single column in a single row group"""

Review Comment:
   ```suggestion
       """Statistics for a single column in a single row group."""
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -96,72 +98,87 @@ cdef class Statistics(_Weakrefable):
 
     @property
     def has_min_max(self):
+        """bool: whether min and max are present"""
         return self.statistics.get().HasMinMax()
 
     @property
     def has_null_count(self):
+        """bool: whether null count is present"""
         return self.statistics.get().HasNullCount()
 
     @property
     def has_distinct_count(self):
+        """bool: whether distinct count is present"""
         return self.statistics.get().HasDistinctCount()
 
     @property
     def min_raw(self):
+        """bool, int, float, or bytes: min value as physical type"""
         if self.has_min_max:
             return _cast_statistic_raw_min(self.statistics.get())
         else:
             return None
 
     @property
     def max_raw(self):
+        """bool, int, float, or bytes: max value as physical type"""
         if self.has_min_max:
             return _cast_statistic_raw_max(self.statistics.get())
         else:
             return None
 
     @property
     def min(self):
+        """min value as logical type"""
         if self.has_min_max:
             return _cast_statistic_min(self.statistics.get())
         else:
             return None
 
     @property
     def max(self):
+        """max value as logical type"""
         if self.has_min_max:
             return _cast_statistic_max(self.statistics.get())
         else:
             return None
 
     @property
     def null_count(self):
+        """int: number of null values in chunk"""
         return self.statistics.get().null_count()
 
     @property
     def distinct_count(self):
+        """int: distinct number of values in chunk"""
+        # TODO: Why is this always zero? ARROW-11793
         return self.statistics.get().distinct_count()
 
     @property
     def num_values(self):
+        """int: number of non-null values"""
         return self.statistics.get().num_values()
 
     @property
     def physical_type(self):
+        """str: physical type of column"""
         raw_physical_type = self.statistics.get().physical_type()
         return physical_type_name_from_enum(raw_physical_type)
 
     @property
     def logical_type(self):
+        """:class:`ParquetLogicalType`: logical type of column"""
         return wrap_logical_type(self.statistics.get().descr().logical_type())
 
     @property
     def converted_type(self):
+        """str or None: legacy converted type"""
         raw_converted_type = self.statistics.get().descr().converted_type()
         return converted_type_name_from_enum(raw_converted_type)
 
 
 cdef class ParquetLogicalType(_Weakrefable):
+    """Logical type of parquet type"""

Review Comment:
   ```suggestion
       """Logical type of Parquet type."""
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -392,45 +418,62 @@ cdef class ColumnChunkMetaData(_Weakrefable):
 
     @property
     def compression(self):
+        """str: type of compression used for column.
+
+        One of 'UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD', 
+        or 'UNKNOWN'."""
         return compression_name_from_enum(self.metadata.compression())
 
     @property
     def encodings(self):
+        """tuple of str: encodings used for column
+
+        One of 'PLAIN', 'BIT_PACKED', 'RLE', 'BYTE_STREAM_SPLIT', 'DELTA_BINARY_PACKED',
+        'DELTA_BYTE_ARRAY'."""
         return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
 
     @property
     def has_dictionary_page(self):
+        """bool: whether there is dictionary data present in the column chunk"""
         return bool(self.metadata.has_dictionary_page())
 
     @property
     def dictionary_page_offset(self):
+        """int: offset of dictionary page relative to column chunk offset"""
         if self.has_dictionary_page:
             return self.metadata.dictionary_page_offset()
         else:
             return None
 
     @property
     def data_page_offset(self):
+        """int: offset of data page relative to column chunk offset"""
         return self.metadata.data_page_offset()
 
     @property
     def has_index_page(self):
+        """Not yet supported"""
         raise NotImplementedError('not supported in parquet-cpp')
 
     @property
     def index_page_offset(self):
+        """Not yet supported"""
         raise NotImplementedError("parquet-cpp doesn't return valid values")
 
     @property
     def total_compressed_size(self):
+        """int: compressed size in bytes"""
         return self.metadata.total_compressed_size()
 
     @property
     def total_uncompressed_size(self):
+        """int: uncompressed size in bytes"""
         return self.metadata.total_uncompressed_size()
 
 
 cdef class RowGroupMetaData(_Weakrefable):
+    """Metadata for a single row group"""

Review Comment:
   ```suggestion
       """Metadata for a single row group."""
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -732,13 +809,25 @@ cdef class ParquetSchema(_Weakrefable):
         return self.schema.Equals(deref(other.schema))
 
     def column(self, i):
+        """Return the schema for a single column
+
+        Parameters
+        ----------
+        i : int
+            index of column in schema
+
+        Returns
+        -------
+        column_schema : ColumnSchema
+        """
         if i < 0 or i >= len(self):
             raise IndexError('{0} out of bounds'.format(i))
 
         return ColumnSchema(self, i)
 
 
 cdef class ColumnSchema(_Weakrefable):
+    """Schema for a single column"""

Review Comment:
   ```suggestion
       """Schema for a single column."""
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -96,72 +98,87 @@ cdef class Statistics(_Weakrefable):
 
     @property
     def has_min_max(self):
+        """bool: whether min and max are present"""

Review Comment:
   Huh, did not realize numpydoc recommended this style for property docstrings. It looks like the sentences should still end in a period, though?



##########
python/pyarrow/_parquet.pyx:
##########
@@ -609,6 +672,17 @@ cdef class FileMetaData(_Weakrefable):
             return None
 
     def row_group(self, int i):
+        """Get metadata for row group at index i

Review Comment:
   ```suggestion
           """Get metadata for row group at index i.
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -670,6 +744,8 @@ cdef class FileMetaData(_Weakrefable):
 
 
 cdef class ParquetSchema(_Weakrefable):
+    """A Parquet schema"""

Review Comment:
   ```suggestion
       """A Parquet schema."""
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -481,14 +524,17 @@ cdef class RowGroupMetaData(_Weakrefable):
 

Review Comment:
   There are some other methods above that aren't documented; should we document those?



##########
python/pyarrow/_parquet.pyx:
##########
@@ -504,6 +550,8 @@ def _reconstruct_filemetadata(Buffer serialized):
 
 
 cdef class FileMetaData(_Weakrefable):
+    """Parquet metadata for a single file"""

Review Comment:
   ```suggestion
       """Parquet metadata for a single file."""
   ```



##########
python/pyarrow/_parquet.pyx:
##########
@@ -292,6 +309,8 @@ cdef _box_flba(ParquetFLBA val, uint32_t len):
 
 
 cdef class ColumnChunkMetaData(_Weakrefable):
+    """Column metadata for a single row group"""

Review Comment:
   ```suggestion
       """Column metadata for a single row group."""
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org