Posted to commits@arrow.apache.org by jo...@apache.org on 2022/10/20 07:41:30 UTC

[arrow] branch master updated: ARROW-15006: [Python][CI][Doc] Enable numpydoc check PR03 (#13983)

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0f91e684dd ARROW-15006: [Python][CI][Doc] Enable numpydoc check PR03 (#13983)
0f91e684dd is described below

commit 0f91e684ddda3dfd11d376c2755bbc3071c3099d
Author: Bryce Mecum <pe...@gmail.com>
AuthorDate: Wed Oct 19 23:41:24 2022 -0800

    ARROW-15006: [Python][CI][Doc] Enable numpydoc check PR03 (#13983)
    
    Adds an additional numpydoc check to CI (PR03) and fixes all corresponding violations.
    
    Note this does not fully resolve [ARROW-15006](https://issues.apache.org/jira/browse/ARROW-15006).
    
    Authored-by: Bryce Mecum <pe...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 docker-compose.yml             |  2 +-
 python/pyarrow/_csv.pyx        | 76 ++++++++++++++++++------------------
 python/pyarrow/_dataset.pyx    | 78 ++++++++++++++++++------------------
 python/pyarrow/array.pxi       |  8 ++--
 python/pyarrow/ipc.pxi         |  6 +--
 python/pyarrow/ipc.py          | 14 +++----
 python/pyarrow/parquet/core.py | 89 +++++++++++++++++++++---------------------
 7 files changed, 136 insertions(+), 137 deletions(-)
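
numpydoc's PR03 check reports docstrings whose Parameters section is not listed in the same order as the function signature; the reorderings in the diff below (and, where a signature only takes keyword-only arguments, a few signature reorderings) bring PyArrow in line with that rule, while the docker-compose change adds PR03 to the rules checked by the `archery numpydoc` step. A minimal illustration of the kind of violation PR03 flags, using a hypothetical function that is not part of this patch:

    # Hypothetical example only. PR03 would flag this docstring because
    # skip_rows is documented before block_size, while the signature
    # declares block_size first; reordering the docstring entries (or the
    # keyword-only signature) resolves the violation.
    def read_example(path, block_size=None, skip_rows=None):
        """
        Read an example file.

        Parameters
        ----------
        skip_rows : int, optional
            Number of rows to skip at the start of the file.
        block_size : int, optional
            Block size, in bytes, used while reading.
        """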

diff --git a/docker-compose.yml b/docker-compose.yml
index 1c3813757f..86e4c9fd61 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1068,7 +1068,7 @@ services:
       ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
         /arrow/ci/scripts/python_build.sh /arrow /build &&
         pip install -e /arrow/dev/archery[numpydoc] &&
-        archery numpydoc --allow-rule PR01,PR10 &&
+        archery numpydoc --allow-rule PR01,PR03,PR10 &&
         /arrow/ci/scripts/python_test.sh /arrow"]
 
   conda-python-dask:
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 16bd0985e2..578050e710 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -189,22 +189,22 @@ cdef class ReadOptions(_Weakrefable):
         self.options.reset(new CCSVReadOptions(CCSVReadOptions.Defaults()))
 
     def __init__(self, *, use_threads=None, block_size=None, skip_rows=None,
-                 column_names=None, autogenerate_column_names=None,
-                 encoding='utf8', skip_rows_after_names=None):
+                 skip_rows_after_names=None, column_names=None,
+                 autogenerate_column_names=None, encoding='utf8'):
         if use_threads is not None:
             self.use_threads = use_threads
         if block_size is not None:
             self.block_size = block_size
         if skip_rows is not None:
             self.skip_rows = skip_rows
+        if skip_rows_after_names is not None:
+            self.skip_rows_after_names = skip_rows_after_names
         if column_names is not None:
             self.column_names = column_names
         if autogenerate_column_names is not None:
             self.autogenerate_column_names= autogenerate_column_names
         # Python-specific option
         self.encoding = encoding
-        if skip_rows_after_names is not None:
-            self.skip_rows_after_names = skip_rows_after_names
 
     @property
     def use_threads(self):
@@ -243,6 +243,23 @@ cdef class ReadOptions(_Weakrefable):
     def skip_rows(self, value):
         deref(self.options).skip_rows = value
 
+    @property
+    def skip_rows_after_names(self):
+        """
+        The number of rows to skip after the column names.
+        This number can be larger than the number of rows in one
+        block, and empty rows are counted.
+        The order of application is as follows:
+        - `skip_rows` is applied (if non-zero);
+        - column names are read (unless `column_names` is set);
+        - `skip_rows_after_names` is applied (if non-zero).
+        """
+        return deref(self.options).skip_rows_after_names
+
+    @skip_rows_after_names.setter
+    def skip_rows_after_names(self, value):
+        deref(self.options).skip_rows_after_names = value
+
     @property
     def column_names(self):
         """
@@ -271,23 +288,6 @@ cdef class ReadOptions(_Weakrefable):
     def autogenerate_column_names(self, value):
         deref(self.options).autogenerate_column_names = value
 
-    @property
-    def skip_rows_after_names(self):
-        """
-        The number of rows to skip after the column names.
-        This number can be larger than the number of rows in one
-        block, and empty rows are counted.
-        The order of application is as follows:
-        - `skip_rows` is applied (if non-zero);
-        - column names are read (unless `column_names` is set);
-        - `skip_rows_after_names` is applied (if non-zero).
-        """
-        return deref(self.options).skip_rows_after_names
-
-    @skip_rows_after_names.setter
-    def skip_rows_after_names(self, value):
-        deref(self.options).skip_rows_after_names = value
-
     def validate(self):
         check_status(deref(self.options).Validate())
 
@@ -296,11 +296,11 @@ cdef class ReadOptions(_Weakrefable):
             self.use_threads == other.use_threads and
             self.block_size == other.block_size and
             self.skip_rows == other.skip_rows and
+            self.skip_rows_after_names == other.skip_rows_after_names and
             self.column_names == other.column_names and
             self.autogenerate_column_names ==
             other.autogenerate_column_names and
-            self.encoding == other.encoding and
-            self.skip_rows_after_names == other.skip_rows_after_names
+            self.encoding == other.encoding
         )
 
     @staticmethod
@@ -605,11 +605,6 @@ cdef class ConvertOptions(_Weakrefable):
     decimal_point : 1-character string, optional (default '.')
         The character used as decimal point in floating-point and decimal
         data.
-    timestamp_parsers : list, optional
-        A sequence of strptime()-compatible format strings, tried in order
-        when attempting to infer or convert timestamp values (the special
-        value ISO8601() can also be given).  By default, a fast built-in
-        ISO-8601 parser is used.
     strings_can_be_null : bool, optional (default False)
         Whether string / binary columns can have null values.
         If true, then strings in null_values are considered null for
@@ -620,16 +615,6 @@ cdef class ConvertOptions(_Weakrefable):
         If true, then strings in "null_values" are also considered null
         when they appear quoted in the CSV file. Otherwise, quoted values
         are never considered null.
-    auto_dict_encode : bool, optional (default False)
-        Whether to try to automatically dict-encode string / binary data.
-        If true, then when type inference detects a string or binary column,
-        it is dict-encoded up to `auto_dict_max_cardinality` distinct values
-        (per chunk), after which it switches to regular encoding.
-        This setting is ignored for non-inferred columns (those in
-        `column_types`).
-    auto_dict_max_cardinality : int, optional
-        The maximum dictionary cardinality for `auto_dict_encode`.
-        This value is per chunk.
     include_columns : list, optional
         The names of columns to include in the Table.
         If empty, the Table will include all columns from the CSV file.
@@ -641,6 +626,21 @@ cdef class ConvertOptions(_Weakrefable):
         produce a column of nulls (whose type is selected using
         `column_types`, or null by default).
         This option is ignored if `include_columns` is empty.
+    auto_dict_encode : bool, optional (default False)
+        Whether to try to automatically dict-encode string / binary data.
+        If true, then when type inference detects a string or binary column,
+        it is dict-encoded up to `auto_dict_max_cardinality` distinct values
+        (per chunk), after which it switches to regular encoding.
+        This setting is ignored for non-inferred columns (those in
+        `column_types`).
+    auto_dict_max_cardinality : int, optional
+        The maximum dictionary cardinality for `auto_dict_encode`.
+        This value is per chunk.
+    timestamp_parsers : list, optional
+        A sequence of strptime()-compatible format strings, tried in order
+        when attempting to infer or convert timestamp values (the special
+        value ISO8601() can also be given).  By default, a fast built-in
+        ISO-8601 parser is used.
 
     Examples
     --------
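
The `ReadOptions` changes above only swap keyword-only parameters (note the `*` in `__init__`) and move the `skip_rows_after_names` property so that signature, docstring, and property order agree; callers that pass options by name are unaffected. A minimal usage sketch (the CSV path is a placeholder):

    import pyarrow.csv as csv

    # skip_rows is applied first, then the column names are read, then
    # skip_rows_after_names skips rows that follow the header.
    opts = csv.ReadOptions(skip_rows=1, skip_rows_after_names=2)
    table = csv.read_csv("example.csv", read_options=opts)  # placeholder path
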
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 4ab08d4554..154a02481c 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -257,7 +257,7 @@ cdef class Dataset(_Weakrefable):
         ...                   'n_legs': [2, 2, 4, 4, 5, 100],
         ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
         ...                              "Brittle stars", "Centipede"]})
-        >>> 
+        >>>
         >>> import pyarrow.parquet as pq
         >>> pq.write_table(table, "dataset_scanner.parquet")
 
@@ -1221,12 +1221,12 @@ cdef class CsvFileFormat(FileFormat):
     ----------
     parse_options : pyarrow.csv.ParseOptions
         Options regarding CSV parsing.
+    default_fragment_scan_options : CsvFragmentScanOptions
+        Default options for fragments scan.
     convert_options : pyarrow.csv.ConvertOptions
         Options regarding value conversion.
     read_options : pyarrow.csv.ReadOptions
         General read options.
-    default_fragment_scan_options : CsvFragmentScanOptions
-        Default options for fragments scan.
     """
     cdef:
         CCsvFileFormat* csv_format
@@ -2315,17 +2315,17 @@ cdef class Scanner(_Weakrefable):
         projections.
 
         The list of columns or expressions may use the special fields
-        `__batch_index` (the index of the batch within the fragment), 
-        `__fragment_index` (the index of the fragment within the dataset), 
+        `__batch_index` (the index of the batch within the fragment),
+        `__fragment_index` (the index of the fragment within the dataset),
         `__last_in_fragment` (whether the batch is last in fragment), and
-        `__filename` (the name of the source file or a description of the 
+        `__filename` (the name of the source file or a description of the
         source fragment).
 
         The columns will be passed down to Datasets and corresponding data
         fragments to avoid loading, copying, and deserializing columns
         that will not be required further down the compute chain.
-        By default all of the available columns are projected. 
-        Raises an exception if any of the referenced column names does 
+        By default all of the available columns are projected.
+        Raises an exception if any of the referenced column names does
         not exist in the dataset's Schema.
     filter : Expression, default None
         Scan will return only the rows matching the filter.
@@ -2338,8 +2338,9 @@ cdef class Scanner(_Weakrefable):
         record batches are overflowing memory then this method can be
         called to reduce their size.
     batch_readahead : int, default 16
-        The number of batches to read ahead in a file. Increasing this number 
-        will increase RAM usage but could also improve IO utilization.
+        The number of batches to read ahead in a file. This might not work
+        for all file formats. Increasing this number will increase
+        RAM usage but could also improve IO utilization.
     fragment_readahead : int, default 4
         The number of files to read ahead. Increasing this number will increase
         RAM usage but could also improve IO utilization.
@@ -2375,14 +2376,13 @@ cdef class Scanner(_Weakrefable):
         return self.wrapped
 
     @staticmethod
-    def from_dataset(Dataset dataset not None,
-                     bint use_threads=True, object use_async=None,
-                     MemoryPool memory_pool=None,
-                     object columns=None, Expression filter=None,
-                     int batch_size=_DEFAULT_BATCH_SIZE,
+    def from_dataset(Dataset dataset not None, *, object columns=None,
+                     Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
                      int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                      int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
-                     FragmentScanOptions fragment_scan_options=None):
+                     FragmentScanOptions fragment_scan_options=None,
+                     bint use_threads=True, object use_async=None,
+                     MemoryPool memory_pool=None):
         """
         Create Scanner from Dataset,
 
@@ -2397,10 +2397,10 @@ cdef class Scanner(_Weakrefable):
             projections.
 
             The list of columns or expressions may use the special fields
-            `__batch_index` (the index of the batch within the fragment), 
-            `__fragment_index` (the index of the fragment within the dataset), 
+            `__batch_index` (the index of the batch within the fragment),
+            `__fragment_index` (the index of the fragment within the dataset),
             `__last_in_fragment` (whether the batch is last in fragment), and
-            `__filename` (the name of the source file or a description of the 
+            `__filename` (the name of the source file or a description of the
             source fragment).
 
             The columns will be passed down to Datasets and corresponding data
@@ -2426,6 +2426,9 @@ cdef class Scanner(_Weakrefable):
         fragment_readahead : int, default 4
             The number of files to read ahead. Increasing this number will increase
             RAM usage but could also improve IO utilization.
+        fragment_scan_options : FragmentScanOptions, default None
+            Options specific to a particular scan and fragment type, which
+            can change between different scans of the same dataset.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2436,9 +2439,6 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
-        fragment_scan_options : FragmentScanOptions, default None
-            Options specific to a particular scan and fragment type, which
-            can change between different scans of the same dataset.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2461,13 +2461,13 @@ cdef class Scanner(_Weakrefable):
         return Scanner.wrap(scanner)
 
     @staticmethod
-    def from_fragment(Fragment fragment not None, Schema schema=None,
-                      bint use_threads=True, object use_async=None,
-                      MemoryPool memory_pool=None,
+    def from_fragment(Fragment fragment not None, *, Schema schema=None,
                       object columns=None, Expression filter=None,
                       int batch_size=_DEFAULT_BATCH_SIZE,
                       int batch_readahead=_DEFAULT_BATCH_READAHEAD,
-                      FragmentScanOptions fragment_scan_options=None):
+                      FragmentScanOptions fragment_scan_options=None,
+                      bint use_threads=True, object use_async=None,
+                      MemoryPool memory_pool=None,):
         """
         Create Scanner from Fragment,
 
@@ -2484,10 +2484,10 @@ cdef class Scanner(_Weakrefable):
             projections.
 
             The list of columns or expressions may use the special fields
-            `__batch_index` (the index of the batch within the fragment), 
-            `__fragment_index` (the index of the fragment within the dataset), 
+            `__batch_index` (the index of the batch within the fragment),
+            `__fragment_index` (the index of the fragment within the dataset),
             `__last_in_fragment` (whether the batch is last in fragment), and
-            `__filename` (the name of the source file or a description of the 
+            `__filename` (the name of the source file or a description of the
             source fragment).
 
             The columns will be passed down to Datasets and corresponding data
@@ -2510,6 +2510,9 @@ cdef class Scanner(_Weakrefable):
             The number of batches to read ahead in a file. This might not work
             for all file formats. Increasing this number will increase
             RAM usage but could also improve IO utilization.
+        fragment_scan_options : FragmentScanOptions, default None
+            Options specific to a particular scan and fragment type, which
+            can change between different scans of the same dataset.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2520,9 +2523,6 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
-        fragment_scan_options : FragmentScanOptions, default None
-            Options specific to a particular scan and fragment type, which
-            can change between different scans of the same dataset.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2549,11 +2549,11 @@ cdef class Scanner(_Weakrefable):
         return Scanner.wrap(scanner)
 
     @staticmethod
-    def from_batches(source, Schema schema=None, bint use_threads=True,
-                     object use_async=None, MemoryPool memory_pool=None,
-                     object columns=None, Expression filter=None,
-                     int batch_size=_DEFAULT_BATCH_SIZE,
-                     FragmentScanOptions fragment_scan_options=None):
+    def from_batches(source, *, Schema schema=None, object columns=None,
+                     Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
+                     FragmentScanOptions fragment_scan_options=None,
+                     bint use_threads=True, object use_async=None,
+                     MemoryPool memory_pool=None):
         """
         Create a Scanner from an iterator of batches.
 
@@ -2574,6 +2574,8 @@ cdef class Scanner(_Weakrefable):
             Scan will return only the rows matching the filter.
         batch_size : int, default 128Ki
             The maximum row count for scanned record batches.
+        fragment_scan_options : FragmentScanOptions
+            The fragment scan options.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2584,8 +2586,6 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
-        fragment_scan_options : FragmentScanOptions
-            The fragment scan options.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
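
Besides matching the documented order, the `Scanner` factory methods above now take their options as keyword-only arguments (the added `*`), so they should be passed by name. A minimal sketch reusing the `dataset_scanner.parquet` file written in the docstring example earlier in this diff:

    import pyarrow.dataset as ds

    dataset = ds.dataset("dataset_scanner.parquet", format="parquet")
    # columns, filter, batch_size, use_threads, ... are keyword-only now.
    scanner = ds.Scanner.from_dataset(dataset, columns=["n_legs", "animal"],
                                      use_threads=True)
    table = scanner.to_table()
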
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 036301d90e..86d1f0e39c 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -702,11 +702,11 @@ cdef class _PandasConvertible(_Weakrefable):
         memory_pool : MemoryPool, default None
             Arrow MemoryPool to use for allocations. Uses the default memory
             pool if not passed.
-        strings_to_categorical : bool, default False
-            Encode string (UTF8) and binary types to pandas.Categorical.
         categories : list, default empty
             List of fields that should be returned as pandas.Categorical. Only
             applies to table-like data structures.
+        strings_to_categorical : bool, default False
+            Encode string (UTF8) and binary types to pandas.Categorical.
         zero_copy_only : bool, default False
             Raise an ArrowException if this function call would require copying
             the underlying data.
@@ -2549,11 +2549,11 @@ cdef class DictionaryArray(Array):
             The array of values referenced by the indices.
         mask : ndarray or pandas.Series, bool type
             True values indicate that indices are actually null.
+        ordered : bool, default False
+            Set to True if the category values are ordered.
         from_pandas : bool, default False
             If True, the indices should be treated as though they originated in
             a pandas.Categorical (null encoded as -1).
-        ordered : bool, default False
-            Set to True if the category values are ordered.
         safe : bool, default True
             If True, check that the dictionary indices are in range.
         memory_pool : MemoryPool, default None
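
The `array.pxi` hunks above likewise reorder the documented parameters of `to_pandas` and `DictionaryArray.from_arrays` to follow their signatures; behaviour is unchanged when the arguments are passed by keyword. A minimal sketch (the final call requires pandas):

    import pyarrow as pa

    indices = pa.array([0, 1, 0, 1], type=pa.int8())
    dictionary = pa.array(["low", "high"])
    dict_arr = pa.DictionaryArray.from_arrays(indices, dictionary, ordered=True)
    series = dict_arr.to_pandas()  # pandas Series with categorical dtype
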
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 90822766db..a9f127ef46 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -106,12 +106,12 @@ cdef class IpcReadOptions(_Weakrefable):
 
     Parameters
     ----------
-    use_threads : bool
-        Whether to use the global CPU thread pool to parallelize any
-        computational tasks like decompression.
     ensure_native_endian : bool
         Whether to convert incoming data to platform-native endianness.
         Default is true.
+    use_threads : bool
+        Whether to use the global CPU thread pool to parallelize any
+        computational tasks like decompression.
     included_fields : list
         If empty (the default), return all deserialized fields.
         If non-empty, the values are the indices of fields to read on
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index d63c323b33..fc724109d9 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -59,6 +59,12 @@ sink : str, pyarrow.NativeFile, or file-like Python object
     Either a file path, or a writable file object.
 schema : pyarrow.Schema
     The Arrow schema for data to be written to the file.
+use_legacy_format : bool, default None
+    Deprecated in favor of setting options. Cannot be provided with
+    options.
+
+    If None, False will be used unless this default is overridden by
+    setting the environment variable ARROW_PRE_0_15_IPC_FORMAT=1
 options : pyarrow.ipc.IpcWriteOptions
     Options for IPC serialization.
 
@@ -66,13 +72,7 @@ options : pyarrow.ipc.IpcWriteOptions
     be used unless overridden by setting the environment variable
     ARROW_PRE_0_15_IPC_FORMAT=1, and the V5 metadata version will be
     used unless overridden by setting the environment variable
-    ARROW_PRE_1_0_METADATA_VERSION=1.
-use_legacy_format : bool, default None
-    Deprecated in favor of setting options. Cannot be provided with
-    options.
-
-    If None, False will be used unless this default is overridden by
-    setting the environment variable ARROW_PRE_0_15_IPC_FORMAT=1"""
+    ARROW_PRE_1_0_METADATA_VERSION=1."""
 
 
 class RecordBatchStreamWriter(lib._RecordBatchStreamWriter):
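
The IPC docstrings above now list `use_legacy_format` (deprecated in favour of `options`) and the `IpcReadOptions` fields in signature order. A minimal sketch of the keyword-style usage the docstrings describe (the file name is a placeholder):

    import pyarrow as pa

    schema = pa.schema([("n_legs", pa.int64())])
    batch = pa.record_batch([pa.array([2, 4])], schema=schema)

    # Writing: prefer IpcWriteOptions over the deprecated use_legacy_format.
    with pa.ipc.new_file("example.arrow", schema,
                         options=pa.ipc.IpcWriteOptions()) as writer:
        writer.write_batch(batch)

    # Reading: IpcReadOptions fields are passed as keyword arguments.
    read_options = pa.ipc.IpcReadOptions(ensure_native_endian=True,
                                         use_threads=True)
    reader = pa.ipc.open_file("example.arrow", options=read_options)
    table = reader.read_all()
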
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index da3315441c..5716719dde 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -229,6 +229,8 @@ class ParquetFile:
     common_metadata : FileMetaData, default None
         Will be used in reads for pandas schema metadata if not found in the
         main file's metadata, no other uses at the moment.
+    read_dictionary : list
+        List of column names to read directly as DictionaryArray.
     memory_map : bool, default False
         If the source is a file path, use a memory map to read file, which can
         improve performance in some environments.
@@ -239,8 +241,6 @@ class ParquetFile:
         Coalesce and issue file reads in parallel to improve performance on
         high-latency filesystems (e.g. S3). If True, Arrow will use a
         background I/O thread pool.
-    read_dictionary : list
-        List of column names to read directly as DictionaryArray.
     coerce_int96_timestamp_unit : str, default None.
         Cast timestamps that are stored in INT96 format to a particular
         resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
@@ -737,6 +737,12 @@ _parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4"
 use_dictionary : bool or list
     Specify if we should use dictionary encoding in general or only for
     some columns.
+compression : str or dict
+    Specify the compression codec, either on a general basis or per-column.
+    Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
+write_statistics : bool or list
+    Specify if we should write statistics in general (default is True) or only
+    for some columns.
 use_deprecated_int96_timestamps : bool, default None
     Write timestamps to INT96 Parquet format. Defaults to False unless enabled
     by flavor argument. This take priority over the coerce_timestamps option.
@@ -750,22 +756,16 @@ coerce_timestamps : str, default None
     If the casting results in loss of data, it will raise an exception
     unless ``allow_truncated_timestamps=True`` is given.
     Valid values: {None, 'ms', 'us'}
-data_page_size : int, default None
-    Set a target threshold for the approximate encoded size of data
-    pages within a column chunk (in bytes). If None, use the default data page
-    size of 1MByte.
 allow_truncated_timestamps : bool, default False
     Allow loss of data when coercing timestamps to a particular
     resolution. E.g. if microsecond or nanosecond data is lost when coercing to
     'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True``
     will NOT result in the truncation exception being ignored unless
     ``coerce_timestamps`` is not None.
-compression : str or dict
-    Specify the compression codec, either on a general basis or per-column.
-    Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
-write_statistics : bool or list
-    Specify if we should write statistics in general (default is True) or only
-    for some columns.
+data_page_size : int, default None
+    Set a target threshold for the approximate encoded size of data
+    pages within a column chunk (in bytes). If None, use the default data page
+    size of 1MByte.
 flavor : {'spark'}, default None
     Sanitize schema or set other compatibility options to work with
     various target systems.
@@ -1095,12 +1095,12 @@ class ParquetDatasetPiece:
         Path to file in the file system where this piece is located.
     open_file_func : callable
         Function to use for obtaining file handle to dataset piece.
-    partition_keys : list of tuples
-        Two-element tuples of ``(column name, ordinal index)``.
-    row_group : int, default None
-        Row group to load. By default, reads all row groups.
     file_options : dict
         Options
+    row_group : int, default None
+        Row group to load. By default, reads all row groups.
+    partition_keys : list of tuples
+        Two-element tuples of ``(column name, ordinal index)``.
     """
 
     def __init__(self, path, open_file_func=partial(open, mode='rb'),
@@ -1650,11 +1650,11 @@ filesystem : FileSystem, default None
     If nothing passed, will be inferred based on path.
     Path will try to be found in the local on-disk filesystem otherwise
     it will be parsed as an URI to determine the filesystem.
-metadata : pyarrow.parquet.FileMetaData
-    Use metadata obtained elsewhere to validate file schemas.
 schema : pyarrow.parquet.Schema
     Use schema obtained elsewhere to validate file schemas. Alternative to
     metadata parameter.
+metadata : pyarrow.parquet.FileMetaData
+    Use metadata obtained elsewhere to validate file schemas.
 split_row_groups : bool, default False
     Divide files into pieces for each row group in the file.
 validate_schema : bool, default True
@@ -2666,19 +2666,6 @@ schema : Schema, optional
     Optionally provide the Schema for the parquet dataset, in which case it
     will not be inferred from the source.
 {1}
-use_legacy_dataset : bool, default False
-    By default, `read_table` uses the new Arrow Datasets API since
-    pyarrow 1.0.0. Among other things, this allows to pass `filters`
-    for all columns and not only the partition keys, enables
-    different partitioning schemes, etc.
-    Set to True to use the legacy behaviour (this option is deprecated,
-    and the legacy implementation will be removed in a future version).
-ignore_prefixes : list, optional
-    Files matching any of these prefixes will be ignored by the
-    discovery process if use_legacy_dataset=False.
-    This is matched to the basename of a path.
-    By default this is ['.', '_'].
-    Note that discovery happens only if a directory is passed as source.
 filesystem : FileSystem, default None
     If nothing passed, will be inferred based on path.
     Path will try to be found in the local on-disk filesystem otherwise
@@ -2693,6 +2680,19 @@ filters : List[Tuple] or List[List[Tuple]] or None (default)
     and different partitioning schemes are supported.
 
     {3}
+use_legacy_dataset : bool, default False
+    By default, `read_table` uses the new Arrow Datasets API since
+    pyarrow 1.0.0. Among other things, this allows to pass `filters`
+    for all columns and not only the partition keys, enables
+    different partitioning schemes, etc.
+    Set to True to use the legacy behaviour (this option is deprecated,
+    and the legacy implementation will be removed in a future version).
+ignore_prefixes : list, optional
+    Files matching any of these prefixes will be ignored by the
+    discovery process if use_legacy_dataset=False.
+    This is matched to the basename of a path.
+    By default this is ['.', '_'].
+    Note that discovery happens only if a directory is passed as source.
 pre_buffer : bool, default True
     Coalesce and issue file reads in parallel to improve performance on
     high-latency filesystems (e.g. S3). If True, Arrow will use a
@@ -2805,9 +2805,9 @@ Read data from a single Parquet file:
 
 
 def read_table(source, *, columns=None, use_threads=True, metadata=None,
-               schema=None, use_pandas_metadata=False, memory_map=False,
-               read_dictionary=None, filesystem=None, filters=None,
-               buffer_size=0, partitioning="hive", use_legacy_dataset=False,
+               schema=None, use_pandas_metadata=False, read_dictionary=None,
+               memory_map=False, buffer_size=0, partitioning="hive",
+               filesystem=None, filters=None, use_legacy_dataset=False,
                ignore_prefixes=None, pre_buffer=True,
                coerce_int96_timestamp_unit=None,
                decryption_properties=None, thrift_string_size_limit=None,
@@ -2914,10 +2914,9 @@ read_table.__doc__ = _read_table_docstring.format(
 
 Note: starting with pyarrow 1.0, the default for `use_legacy_dataset` is
 switched to False.""",
-    "\n".join((_read_docstring_common,
-               """use_pandas_metadata : bool, default False
+    "\n".join(("""use_pandas_metadata : bool, default False
     If True and file has custom pandas schema metadata, ensure that
-    index columns are also loaded.""")),
+    index columns are also loaded.""", _read_docstring_common)),
     """pyarrow.Table
     Content of the file as a table (of columns)""",
     _DNF_filter_doc, _read_table_example)
@@ -3086,10 +3085,6 @@ def write_to_dataset(table, root_path, partition_cols=None,
     table : pyarrow.Table
     root_path : str, pathlib.Path
         The root directory of the dataset
-    filesystem : FileSystem, default None
-        If nothing passed, will be inferred based on path.
-        Path will try to be found in the local on-disk filesystem otherwise
-        it will be parsed as an URI to determine the filesystem.
     partition_cols : list,
         Column names by which to partition the dataset.
         Columns are partitioned in the order they are given
@@ -3100,16 +3095,16 @@ def write_to_dataset(table, root_path, partition_cols=None,
         This option is only supported for use_legacy_dataset=True.
         When use_legacy_dataset=None and this option is specified,
         use_legacy_dataset will be set to True.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
     use_legacy_dataset : bool
         Default is False. Set to True to use the legacy behaviour
         (this option is deprecated, and the legacy implementation will be
         removed in a future version). The legacy implementation still
         supports the `partition_filename_cb` keyword but is less efficient
         when using partition columns.
-    use_threads : bool, default True
-        Write files in parallel. If enabled, then maximum parallelism will be
-        used determined by the number of available CPU cores.
-        This option is only supported for use_legacy_dataset=False.
     schema : Schema, optional
         This option is only supported for use_legacy_dataset=False.
     partitioning : Partitioning or list[str], optional
@@ -3124,6 +3119,10 @@ def write_to_dataset(table, root_path, partition_cols=None,
         The token '{i}' will be replaced with an automatically incremented
         integer. If not specified, it defaults to "guid-{i}.parquet".
         This option is only supported for use_legacy_dataset=False.
+    use_threads : bool, default True
+        Write files in parallel. If enabled, then maximum parallelism will be
+        used determined by the number of available CPU cores.
+        This option is only supported for use_legacy_dataset=False.
     file_visitor : function
         If set, this function will be called with a WrittenFile instance
         for each file created during the call.  This object will have both