Posted to commits@arrow.apache.org by am...@apache.org on 2022/04/13 10:24:16 UTC

[arrow] branch master updated: ARROW-15428: [Python] Address docstrings in Parquet classes and functions

This is an automated email from the ASF dual-hosted git repository.

amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new cf68c1cd53 ARROW-15428: [Python] Address docstrings in Parquet classes and functions
cf68c1cd53 is described below

commit cf68c1cd53821c0e5399465e4a5ff459066f7f74
Author: Alenka Frim <fr...@gmail.com>
AuthorDate: Wed Apr 13 12:24:09 2022 +0200

    ARROW-15428: [Python] Address docstrings in Parquet classes and functions
    
    This PR adds docstring examples to the following documentation pages
    (a note on running the new doctests locally follows the list):
    
    - /docs/python/generated/pyarrow.parquet.ParquetDataset.html
    - /docs/python/generated/pyarrow.parquet.ParquetFile.html
    - /docs/python/generated/pyarrow.parquet.ParquetWriter.html
    - /docs/python/generated/pyarrow.parquet.read_table.html
    - /docs/python/generated/pyarrow.parquet.write_table.html
    - /docs/python/generated/pyarrow.parquet.write_to_dataset.html
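
    The examples are written as doctests. A sketch of one way to exercise
    them locally (Arrow's CI may invoke them differently, and additional
    doctest option flags such as NORMALIZE_WHITESPACE may be required;
    ELLIPSIS is needed because several expected outputs are abbreviated
    with "..."):

        >>> import doctest
        >>> import pyarrow.parquet as pq
        >>> results = doctest.testmod(pq, optionflags=doctest.ELLIPSIS)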
    
    Closes #12704 from AlenkaF/ARROW-15428
    
    Lead-authored-by: Alenka Frim <fr...@gmail.com>
    Co-authored-by: Alenka Frim <Al...@users.noreply.github.com>
    Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
    Signed-off-by: Alessandro Molina <am...@turbogears.org>
---
 python/pyarrow/parquet.py                  | 792 ++++++++++++++++++++++++++++-
 python/pyarrow/tests/parquet/test_basic.py |   4 +-
 2 files changed, 777 insertions(+), 19 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 0e3056645a..f68979c87c 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -225,6 +225,47 @@ class ParquetFile:
         in nanoseconds.
     decryption_properties : FileDecryptionProperties, default None
         File decryption properties for Parquet Modular Encryption.
+
+    Examples
+    --------
+
+    Generate an example PyArrow Table and write it to Parquet file:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
+
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_table(table, 'example.parquet')
+
+    Create a ``ParquetFile`` object from the Parquet file:
+
+    >>> parquet_file = pq.ParquetFile('example.parquet')
+
+    Read the data:
+
+    >>> parquet_file.read()
+    pyarrow.Table
+    n_legs: int64
+    animal: string
+    ----
+    n_legs: [[2,2,4,4,5,100]]
+    animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]]
+
+    Create a ParquetFile object with "animal" column as DictionaryArray:
+
+    >>> parquet_file = pq.ParquetFile('example.parquet',
+    ...                               read_dictionary=["animal"])
+    >>> parquet_file.read()
+    pyarrow.Table
+    n_legs: int64
+    animal: dictionary<values=string, indices=int32, ordered=0>
+    ----
+    n_legs: [[2,2,4,4,5,100]]
+    animal: [  -- dictionary:
+    ["Flamingo","Parrot",...,"Brittle stars","Centipede"]  -- indices:
+    [0,1,2,3,4,5]]
     """
 
     def __init__(self, source, metadata=None, common_metadata=None,
@@ -263,6 +304,9 @@ class ParquetFile:
 
     @property
     def metadata(self):
+        """
+        Return the Parquet metadata.
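+
+        Examples
+        --------
+        A minimal sketch in the style of the other ``ParquetFile``
+        examples, inspecting a couple of fields of the returned
+        ``FileMetaData``:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.metadata.num_rows
+        6
+        >>> parquet_file.metadata.num_columns
+        2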
+        """
         return self.reader.metadata
 
     @property
@@ -277,11 +321,45 @@ class ParquetFile:
         """
         Return the inferred Arrow schema, converted from the whole Parquet
         file's schema
+
+        Examples
+        --------
+        Generate an example Parquet file:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        Read the Arrow schema:
+
+        >>> parquet_file.schema_arrow
+        n_legs: int64
+        animal: string
         """
         return self.reader.schema_arrow
 
     @property
     def num_row_groups(self):
+        """
+        Return the number of row groups of the Parquet file.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.num_row_groups
+        1
+        """
         return self.reader.num_row_groups
 
     def read_row_group(self, i, columns=None, use_threads=True,
@@ -307,6 +385,24 @@ class ParquetFile:
         -------
         pyarrow.table.Table
             Content of the row group as a table (of columns)
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.read_row_group(0)
+        pyarrow.Table
+        n_legs: int64
+        animal: string
+        ----
+        n_legs: [[2,2,4,4,5,100]]
+        animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]]
         """
         column_indices = self._get_column_indices(
             columns, use_pandas_metadata=use_pandas_metadata)
@@ -336,6 +432,24 @@ class ParquetFile:
         -------
         pyarrow.table.Table
             Content of the row groups as a table (of columns).
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.read_row_groups([0,0])
+        pyarrow.Table
+        n_legs: int64
+        animal: string
+        ----
+        n_legs: [[2,2,4,4,5,...,2,4,4,5,100]]
+        animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]]
         """
         column_indices = self._get_column_indices(
             columns, use_pandas_metadata=use_pandas_metadata)
@@ -346,7 +460,7 @@ class ParquetFile:
     def iter_batches(self, batch_size=65536, row_groups=None, columns=None,
                      use_threads=True, use_pandas_metadata=False):
         """
-        Read streaming batches from a Parquet file
+        Read streaming batches from a Parquet file.
 
         Parameters
         ----------
@@ -369,6 +483,30 @@ class ParquetFile:
         -------
         iterator of pyarrow.RecordBatch
             Contents of each batch as a record batch
+
+        Examples
+        --------
+        Generate an example Parquet file:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+        >>> for i in parquet_file.iter_batches():
+        ...     print("RecordBatch")
+        ...     print(i.to_pandas())
+        ...
+        RecordBatch
+           n_legs         animal
+        0       2       Flamingo
+        1       2         Parrot
+        2       4            Dog
+        3       4          Horse
+        4       5  Brittle stars
+        5     100      Centipede
         """
         if row_groups is None:
             row_groups = range(0, self.metadata.num_row_groups)
@@ -383,7 +521,7 @@ class ParquetFile:
 
     def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
         """
-        Read a Table from Parquet format,
+        Read a Table from Parquet format.
 
         Parameters
         ----------
@@ -401,6 +539,26 @@ class ParquetFile:
         -------
         pyarrow.table.Table
             Content of the file as a table (of columns).
+
+        Examples
+        --------
+        Generate an example Parquet file:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        Read a Table:
+
+        >>> parquet_file.read(columns=["animal"])
+        pyarrow.Table
+        animal: string
+        ----
+        animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]]
         """
         column_indices = self._get_column_indices(
             columns, use_pandas_metadata=use_pandas_metadata)
@@ -426,6 +584,19 @@ class ParquetFile:
         Returns
         -------
         num_rows : number of rows in file
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.scan_contents()
+        6
         """
         column_indices = self._get_column_indices(columns)
         return self.reader.scan_contents(column_indices,
@@ -612,6 +783,56 @@ write_batch_size : int, default None
     the batch size can help keep page sizes closer to the intended size.
 """
 
+_parquet_writer_example_doc = """\
+Generate an example PyArrow Table and RecordBatch:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+>>> batch = pa.record_batch([[2, 2, 4, 4, 5, 100],
+...                         ["Flamingo", "Parrot", "Dog", "Horse",
+...                          "Brittle stars", "Centipede"]],
+...                         names=['n_legs', 'animal'])
+
+create a ParquetWriter object:
+
+>>> import pyarrow.parquet as pq
+>>> writer = pq.ParquetWriter('example.parquet', table.schema)
+
+and write the Table into the Parquet file:
+
+>>> writer.write_table(table)
+>>> writer.close()
+
+>>> pq.read_table('example.parquet').to_pandas()
+   n_legs         animal
+0       2       Flamingo
+1       2         Parrot
+2       4            Dog
+3       4          Horse
+4       5  Brittle stars
+5     100      Centipede
+
+create a ParquetWriter object for the RecordBatch:
+
+>>> writer2 = pq.ParquetWriter('example2.parquet', batch.schema)
+
+and write the RecordBatch into the Parquet file:
+
+>>> writer2.write_batch(batch)
+>>> writer2.close()
+
+>>> pq.read_table('example2.parquet').to_pandas()
+   n_legs         animal
+0       2       Flamingo
+1       2         Parrot
+2       4            Dog
+3       4          Horse
+4       5  Brittle stars
+5     100      Centipede
+"""
+
 
 class ParquetWriter:
 
@@ -629,7 +850,11 @@ writer_engine_version : unused
     corresponding value is assumed to be a list (or any object with
     `.append` method) that will be filled with the file metadata instance
     of the written file.
-""".format(_parquet_writer_arg_docs)
+
+Examples
+--------
+{}
+""".format(_parquet_writer_arg_docs, _parquet_writer_example_doc)
 
     def __init__(self, where, schema, filesystem=None,
                  flavor=None,
@@ -760,6 +985,7 @@ writer_engine_version : unused
             Maximum size of each written row group. If None, the
             row group size will be the minimum of the Table size
             and 64 * 1024 * 1024.
+
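+        Examples
+        --------
+        A sketch in the style of the class-level example, splitting the
+        six-row Table into two row groups (the file name is only
+        illustrative):
+
+        >>> import pyarrow as pa
+        >>> import pyarrow.parquet as pq
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> writer = pq.ParquetWriter('example_rg.parquet', table.schema)
+        >>> writer.write_table(table, row_group_size=3)
+        >>> writer.close()
+        >>> pq.ParquetFile('example_rg.parquet').num_row_groups
+        2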
         """
         if self.schema_changed:
             table = _sanitize_table(table, self.schema, self.flavor)
@@ -774,6 +1000,9 @@ writer_engine_version : unused
         self.writer.write_table(table, row_group_size=row_group_size)
 
     def close(self):
+        """
+        Close the connection to the Parquet file.
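+
+        Examples
+        --------
+        A short sketch using the writer as a context manager, which calls
+        ``close()`` on exit (the file name is only illustrative):
+
+        >>> import pyarrow as pa
+        >>> import pyarrow.parquet as pq
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> with pq.ParquetWriter('example_close.parquet',
+        ...                       table.schema) as writer:
+        ...     writer.write_table(table)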
+        """
         if self.is_open:
             self.writer.close()
             self.is_open = False
@@ -1310,6 +1539,45 @@ default "hive"
     you need to specify the field names or a full schema. See the
     ``pyarrow.dataset.partitioning()`` function for more details."""
 
+_parquet_dataset_example = """\
+Generate an example PyArrow Table and write it to a partitioned dataset:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+...                   'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+
+>>> import pyarrow.parquet as pq
+>>> pq.write_to_dataset(table, root_path='dataset_name',
+...                     partition_cols=['year'],
+...                     use_legacy_dataset=False)
+
+create a ParquetDataset object from the dataset source:
+
+>>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False)
+
+and read the data:
+
+>>> dataset.read().to_pandas()
+   n_legs         animal  year
+0       5  Brittle stars  2019
+1       2       Flamingo  2020
+2       4            Dog  2021
+3     100      Centipede  2021
+4       2         Parrot  2022
+5       4          Horse  2022
+
+create a ParquetDataset object with a filter:
+
+>>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False,
+...                             filters=[('n_legs','=',4)])
+>>> dataset.read().to_pandas()
+   n_legs animal  year
+0       4    Dog  2021
+1       4  Horse  2022
+"""
+
 
 class ParquetDataset:
 
@@ -1350,7 +1618,7 @@ metadata_nthreads : int, default 1
     datasets.
 {0}
 use_legacy_dataset : bool, default True
-    Set to False to enable the new code path (experimental, using the
+    Set to False to enable the new code path (using the
     new Arrow Dataset API). Among other things, this allows to pass
     `filters` for all columns and not only the partition keys, enables
     different partitioning schemes, etc.
@@ -1365,7 +1633,11 @@ coerce_int96_timestamp_unit : str, default None.
     Cast timestamps that are stored in INT96 format to a particular resolution
     (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96
     timestamps will be inferred as timestamps in nanoseconds.
-""".format(_read_docstring_common, _DNF_filter_doc)
+
+Examples
+--------
+{2}
+""".format(_read_docstring_common, _DNF_filter_doc, _parquet_dataset_example)
 
     def __new__(cls, path_or_paths=None, filesystem=None, schema=None,
                 metadata=None, split_row_groups=False, validate_schema=True,
@@ -1549,6 +1821,30 @@ coerce_int96_timestamp_unit : str, default None.
         -------
         pyarrow.Table
             Content of the file as a table (of columns).
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_name_read',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq.ParquetDataset('dataset_name_read/',
+        ...                             use_legacy_dataset=False)
+
+        Read multiple Parquet files as a single pyarrow.Table:
+
+        >>> dataset.read(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[5],[2],...,[2],[4]]
         """
         tables = []
         for piece in self._pieces:
@@ -1586,6 +1882,38 @@ coerce_int96_timestamp_unit : str, default None.
         -------
         pyarrow.Table
             Content of the file as a table (of columns).
+
+        Examples
+        --------
+        Generate an example PyArrow Table and write it to a partitioned
+        dataset:
+
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                    'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                    'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                    "Brittle stars", "Centipede"]})
+        >>> table = pa.Table.from_pandas(df)
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_name_read_pandas',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq.ParquetDataset('dataset_name_read_pandas/',
+        ...                             use_legacy_dataset=False)
+
+        Read dataset including pandas metadata:
+
+        >>> dataset.read_pandas(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[5],[2],...,[2],[4]]
+
+        Select pandas metadata:
+
+        >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata
+        {'index_columns': [{'kind': 'range', ... 'pandas_version': '1.4.1'}
         """
         return self.read(use_pandas_metadata=True, **kwargs)
 
@@ -1611,6 +1939,9 @@ coerce_int96_timestamp_unit : str, default None.
 
     @property
     def pieces(self):
+        """
+        DEPRECATED
+        """
         warnings.warn(
             _DEPR_MSG.format(
                 "ParquetDataset.pieces",
@@ -1622,6 +1953,9 @@ coerce_int96_timestamp_unit : str, default None.
 
     @property
     def partitions(self):
+        """
+        DEPRECATED
+        """
         warnings.warn(
             _DEPR_MSG.format(
                 "ParquetDataset.partitions",
@@ -1645,6 +1979,9 @@ coerce_int96_timestamp_unit : str, default None.
 
     @property
     def memory_map(self):
+        """
+        DEPRECATED
+        """
         warnings.warn(
             _DEPR_MSG.format("ParquetDataset.memory_map", ""),
             DeprecationWarning, stacklevel=2)
@@ -1652,6 +1989,9 @@ coerce_int96_timestamp_unit : str, default None.
 
     @property
     def read_dictionary(self):
+        """
+        DEPRECATED
+        """
         warnings.warn(
             _DEPR_MSG.format("ParquetDataset.read_dictionary", ""),
             DeprecationWarning, stacklevel=2)
@@ -1659,6 +1999,9 @@ coerce_int96_timestamp_unit : str, default None.
 
     @property
     def buffer_size(self):
+        """
+        DEPRECATED
+        """
         warnings.warn(
             _DEPR_MSG.format("ParquetDataset.buffer_size", ""),
             DeprecationWarning, stacklevel=2)
@@ -1670,6 +2013,9 @@ coerce_int96_timestamp_unit : str, default None.
 
     @property
     def fs(self):
+        """
+        DEPRECATED
+        """
         warnings.warn(
             _DEPR_MSG.format(
                 "ParquetDataset.fs",
@@ -1679,10 +2025,106 @@ coerce_int96_timestamp_unit : str, default None.
             DeprecationWarning, stacklevel=2)
         return self._metadata.fs
 
-    common_metadata = property(
+    _common_metadata = property(
         operator.attrgetter('_metadata.common_metadata')
     )
 
+    @property
+    def common_metadata(self):
+        """
+        DEPRECATED
+        """
+        warnings.warn(
+            _DEPR_MSG.format("ParquetDataset.common_metadata", ""),
+            DeprecationWarning, stacklevel=2)
+        return self._metadata.common_metadata
+
+    @property
+    def fragments(self):
+        """
+        A list of the Dataset source fragments or pieces with absolute
+        file paths. To use this property, set 'use_legacy_dataset=False'
+        while constructing the ParquetDataset object.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_name_fragments',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq.ParquetDataset('dataset_name_fragments/',
+        ...                             use_legacy_dataset=False)
+
+        List the fragments:
+
+        >>> dataset.fragments
+        [<pyarrow.dataset.ParquetFileFragment path=dataset_name_fragments/...
+        """
+        raise NotImplementedError(
+            "To use this property set 'use_legacy_dataset=False' while "
+            "constructing the ParquetDataset")
+
+    @property
+    def files(self):
+        """
+        A list of absolute Parquet file paths in the Dataset source.
+        To use this property, set 'use_legacy_dataset=False'
+        while constructing the ParquetDataset object.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_name_files',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq.ParquetDataset('dataset_name_files/',
+        ...                             use_legacy_dataset=False)
+
+        List the files:
+
+        >>> dataset.files
+        ['dataset_name_files/year=2019/part-0.parquet', ...
+        """
+        raise NotImplementedError(
+            "To use this property set 'use_legacy_dataset=False' while "
+            "constructing the ParquetDataset")
+
+    @property
+    def filesystem(self):
+        """
+        The filesystem type of the Dataset source.
+        To use this property set 'use_legacy_dataset=False'
+        while constructing ParquetDataset object.
+        """
+        raise NotImplementedError(
+            "To use this property set 'use_legacy_dataset=False' while "
+            "constructing the ParquetDataset")
+
+    @property
+    def partitioning(self):
+        """
+        The partitioning of the Dataset source, if discovered.
+        To use this property, set 'use_legacy_dataset=False'
+        while constructing the ParquetDataset object.
+        """
+        raise NotImplementedError(
+            "To use this property set 'use_legacy_dataset=False' while "
+            "constructing the ParquetDataset")
+
 
 def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1,
                    open_file_func=None):
@@ -1732,6 +2174,44 @@ def _is_local_file_system(fs):
 class _ParquetDatasetV2:
     """
     ParquetDataset shim using the Dataset API under the hood.
+
+    Examples
+    --------
+    Generate an example PyArrow Table and write it to a partitioned dataset:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+    ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_to_dataset(table, root_path='dataset_v2',
+    ...                     partition_cols=['year'],
+    ...                     use_legacy_dataset=False)
+
+    create a _ParquetDatasetV2 object from the dataset source:
+
+    >>> dataset = pq._ParquetDatasetV2('dataset_v2/')
+
+    and read the data:
+
+    >>> dataset.read().to_pandas()
+       n_legs         animal  year
+    0       5  Brittle stars  2019
+    1       2       Flamingo  2020
+    2       4            Dog  2021
+    3     100      Centipede  2021
+    4       2         Parrot  2022
+    5       4          Horse  2022
+
+    create a _ParquetDatasetV2 object with a filter:
+
+    >>> dataset = pq._ParquetDatasetV2('dataset_v2/',
+    ...                                filters=[('n_legs','=',4)])
+    >>> dataset.read().to_pandas()
+       n_legs animal  year
+    0       4    Dog  2021
+    1       4  Horse  2022
     """
 
     def __init__(self, path_or_paths, filesystem=None, filters=None,
@@ -1830,6 +2310,31 @@ class _ParquetDatasetV2:
 
     @property
     def schema(self):
+        """
+        Schema of the Dataset.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_schema',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq._ParquetDatasetV2('dataset_v2_schema/')
+
+        Read the schema:
+
+        >>> dataset.schema
+        n_legs: int64
+        animal: string
+        year: dictionary<values=int32, indices=int32, ordered=0>
+        """
         return self._dataset.schema
 
     def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
@@ -1852,6 +2357,29 @@ class _ParquetDatasetV2:
         -------
         pyarrow.Table
             Content of the file as a table (of columns).
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_read',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq._ParquetDatasetV2('dataset_v2_read/')
+
+        Read the dataset:
+
+        >>> dataset.read(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[5],[2],...,[2],[4]]
         """
         # if use_pandas_metadata, we need to include index columns in the
         # column selection, to be able to restore those in the pandas DataFrame
@@ -1886,6 +2414,32 @@ class _ParquetDatasetV2:
         """
         Read dataset including pandas metadata, if any. Other arguments passed
         through to ParquetDataset.read, see docstring for further details.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_read_pandas',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq._ParquetDatasetV2('dataset_v2_read_pandas/')
+
+        Read the dataset with pandas metadata:
+
+        >>> dataset.read_pandas(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[5],[2],...,[2],[4]]
+
+        >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata
+        {'index_columns': [{'kind': 'range', ... 'pandas_version': '1.4.1'}
         """
         return self.read(use_pandas_metadata=True, **kwargs)
 
@@ -1899,14 +2453,64 @@ class _ParquetDatasetV2:
 
     @property
     def fragments(self):
+        """
+        A list of the Dataset source fragments or pieces with absolute
+        file paths.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_fragments',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq._ParquetDatasetV2('dataset_v2_fragments/')
+
+        List the fragments:
+
+        >>> dataset.fragments
+        [<pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/...
+        """
         return list(self._dataset.get_fragments())
 
     @property
     def files(self):
+        """
+        A list of absolute Parquet file paths in the Dataset source.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_files',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq._ParquetDatasetV2('dataset_v2_files/')
+
+        List the files:
+
+        >>> dataset.files
+        ['dataset_v2_files/year=2019/part-0.parquet', ...
+        """
         return self._dataset.files
 
     @property
     def filesystem(self):
+        """
+        The filesystem type of the Dataset source.
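+
+        Examples
+        --------
+        A minimal sketch (the dataset path is only illustrative); a
+        dataset written to a local path is backed by a local filesystem:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_filesystem',
+        ...                     partition_cols=['year'],
+        ...                     use_legacy_dataset=False)
+        >>> dataset = pq._ParquetDatasetV2('dataset_v2_filesystem/')
+        >>> from pyarrow import fs
+        >>> isinstance(dataset.filesystem, fs.LocalFileSystem)
+        True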
+        """
         return self._dataset.filesystem
 
     @property
@@ -1986,6 +2590,86 @@ decryption_properties : FileDecryptionProperties or None
 Returns
 -------
 {2}
+
+{4}
+"""
+
+_read_table_example = """\
+
+Examples
+--------
+
+Generate an example PyArrow Table and write it to a partitioned dataset:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+...                   'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+>>> import pyarrow.parquet as pq
+>>> pq.write_to_dataset(table, root_path='dataset_name_2',
+...                     partition_cols=['year'])
+
+Read the data:
+
+>>> pq.read_table('dataset_name_2').to_pandas()
+   n_legs         animal  year
+0       5  Brittle stars  2019
+1       2       Flamingo  2020
+2       4            Dog  2021
+3     100      Centipede  2021
+4       2         Parrot  2022
+5       4          Horse  2022
+
+
+Read only a subset of columns:
+
+>>> pq.read_table('dataset_name_2', columns=["n_legs", "animal"])
+pyarrow.Table
+n_legs: int64
+animal: string
+----
+n_legs: [[5],[2],...,[2],[4]]
+animal: [["Brittle stars"],["Flamingo"],...,["Parrot"],["Horse"]]
+
+Read a subset of columns and read one column as DictionaryArray:
+
+>>> pq.read_table('dataset_name_2', columns=["n_legs", "animal"],
+...               read_dictionary=["animal"])
+pyarrow.Table
+n_legs: int64
+animal: dictionary<values=string, indices=int32, ordered=0>
+----
+n_legs: [[5],[2],...,[2],[4]]
+animal: [  -- dictionary:
+["Brittle stars"]  -- indices:
+[0],  -- dictionary:
+["Flamingo"]  -- indices:
+[0],...,  -- dictionary:
+["Parrot"]  -- indices:
+[0],  -- dictionary:
+["Horse"]  -- indices:
+[0]]
+
+Read the table with filter:
+
+>>> pq.read_table('dataset_name_2', columns=["n_legs", "animal"],
+...               filters=[('n_legs','<',4)]).to_pandas()
+   n_legs    animal
+0       2  Flamingo
+1       2    Parrot
+
+Read data from a single Parquet file:
+
+>>> pq.write_table(table, 'example.parquet')
+>>> pq.read_table('example.parquet').to_pandas()
+   year  n_legs         animal
+0  2020       2       Flamingo
+1  2022       2         Parrot
+2  2021       4            Dog
+3  2022       4          Horse
+4  2019       5  Brittle stars
+5  2021     100      Centipede
 """
 
 
@@ -2100,7 +2784,7 @@ switched to False.""",
     index columns are also loaded.""")),
     """pyarrow.Table
     Content of the file as a table (of columns)""",
-    _DNF_filter_doc)
+    _DNF_filter_doc, _read_table_example)
 
 
 def read_pandas(source, columns=None, **kwargs):
@@ -2118,7 +2802,7 @@ read_pandas.__doc__ = _read_table_docstring.format(
     """pyarrow.Table
     Content of the file as a Table of Columns, including DataFrame
     indexes as columns""",
-    _DNF_filter_doc)
+    _DNF_filter_doc, "")
 
 
 def write_table(table, where, row_group_size=None, version='1.0',
@@ -2170,6 +2854,40 @@ def write_table(table, where, row_group_size=None, version='1.0',
         raise
 
 
+_write_table_example = """\
+Generate an example PyArrow Table:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+
+and write the Table into Parquet file:
+
+>>> import pyarrow.parquet as pq
+>>> pq.write_table(table, 'example.parquet')
+
+Defining row group size for the Parquet file:
+
+>>> pq.write_table(table, 'example.parquet', row_group_size=3)
+
+Defining row group compression (default is Snappy):
+
+>>> pq.write_table(table, 'example.parquet', compression='none')
+
+Defining row group compression and encoding per-column:
+
+>>> pq.write_table(table, 'example.parquet',
+...                compression={'n_legs': 'snappy', 'animal': 'gzip'},
+...                use_dictionary=['n_legs', 'animal'])
+
+Defining column encoding per-column:
+
+>>> pq.write_table(table, 'example.parquet',
+...                column_encoding={'animal':'PLAIN'},
+...                use_dictionary=False)
+"""
+
 write_table.__doc__ = """
 Write a Table to Parquet format.
 
@@ -2184,7 +2902,11 @@ row_group_size : int
 {}
 **kwargs : optional
     Additional options for ParquetWriter
-""".format(_parquet_writer_arg_docs)
+
+Examples
+--------
+{}
+""".format(_parquet_writer_arg_docs, _write_table_example)
 
 
 def _mkdir_if_not_exists(fs, path):
@@ -2226,7 +2948,7 @@ def write_to_dataset(table, root_path, partition_cols=None,
         Path will try to be found in the local on-disk filesystem otherwise
         it will be parsed as an URI to determine the filesystem.
     partition_cols : list,
-        Column names by which to partition the dataset
+        Column names by which to partition the dataset.
         Columns are partitioned in the order they are given
     partition_filename_cb : callable,
         A callback function that takes the partition key(s) as an argument
@@ -2244,6 +2966,33 @@ def write_to_dataset(table, root_path, partition_cols=None,
         Using `metadata_collector` in kwargs allows one to collect the
         file metadata instances of dataset pieces. The file paths in the
         ColumnChunkMetaData will be set relative to `root_path`.
+
+    Examples
+    --------
+    Generate an example PyArrow Table:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+    ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
+
+    and write it to a partitioned dataset:
+
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_to_dataset(table, root_path='dataset_name_3',
+    ...                     partition_cols=['year'],
+    ...                     use_legacy_dataset=False)
+    >>> pq.ParquetDataset('dataset_name_3', use_legacy_dataset=False).files
+    ['dataset_name_3/year=2019/part-0.parquet', ...
+
+    Write a single Parquet file into the root folder:
+
+    >>> pq.write_to_dataset(table, root_path='dataset_name_4',
+    ...                     use_legacy_dataset=False)
+    >>> pq.ParquetDataset('dataset_name_4/', use_legacy_dataset=False).files
+    ['dataset_name_4/part-0.parquet']
     """
     if use_legacy_dataset is None:
         # if a new filesystem is passed -> default to new implementation
@@ -2366,24 +3115,31 @@ def write_metadata(schema, where, metadata_collector=None, **kwargs):
 
     Examples
     --------
+    Generate example data:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
 
     Write a dataset and collect metadata information.
 
     >>> metadata_collector = []
-    >>> write_to_dataset(
-    ...     table, root_path,
-    ...     metadata_collector=metadata_collector, **writer_kwargs)
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_to_dataset(
+    ...     table, 'dataset_metadata',
+    ...      metadata_collector=metadata_collector)
 
     Write the `_common_metadata` parquet file without row groups statistics.
 
-    >>> write_metadata(
-    ...     table.schema, root_path / '_common_metadata', **writer_kwargs)
+    >>> pq.write_metadata(
+    ...     table.schema, 'dataset_metadata/_common_metadata')
 
     Write the `_metadata` parquet file with row groups statistics.
 
-    >>> write_metadata(
-    ...     table.schema, root_path / '_metadata',
-    ...     metadata_collector=metadata_collector, **writer_kwargs)
+    >>> pq.write_metadata(
+    ...     table.schema, 'dataset_metadata/_metadata',
+    ...     metadata_collector=metadata_collector)
     """
     writer = ParquetWriter(where, schema, **kwargs)
     writer.close()
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 96e323d7fd..8c7afb8355 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -603,7 +603,9 @@ def test_read_table_doesnt_warn(datadir, use_legacy_dataset):
                       use_legacy_dataset=use_legacy_dataset)
 
     if use_legacy_dataset:
-        assert len(record) == 1
+        # DeprecationWarning: 'use_legacy_dataset=True'
+        # DeprecationWarning: 'ParquetDataset.common_metadata' attribute
+        assert len(record) == 2
     else:
         assert len(record) == 0