Posted to commits@arrow.apache.org by ks...@apache.org on 2020/08/11 15:57:46 UTC

[arrow] 17/22: ARROW-9573: [Python][Dataset] Provide read_table(ignore_prefixes=)

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-1.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 51d0c450ba5a1e88365b1c58aad646ce80ce5db6
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Thu Aug 6 12:10:42 2020 -0400

    ARROW-9573: [Python][Dataset] Provide read_table(ignore_prefixes=)
    
    Closes #7900 from bkietz/9573-expose-ignore_prefixes
    
    Authored-by: Benjamin Kietzman <be...@gmail.com>
    Signed-off-by: Benjamin Kietzman <be...@gmail.com>
---
 python/pyarrow/dataset.py            |  2 +-
 python/pyarrow/parquet.py            | 23 +++++++++++++++++++----
 python/pyarrow/tests/test_parquet.py | 24 ++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 5 deletions(-)
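
For context, a minimal usage sketch of the keyword this commit exposes; the
"data/" path and "_staging" prefix are hypothetical:

    import pyarrow.parquet as pq

    # Skip anything whose basename starts with "_staging" during file
    # discovery. Passing a list replaces the defaults ['.', '_'] rather
    # than extending them (the new test below relies on this: it passes
    # ['_private'] and still reads the '_part=...' partition directories).
    table = pq.read_table("data/", ignore_prefixes=["_staging"])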

diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index f4620d0..fd03aee 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -581,7 +581,7 @@ def dataset(source, schema=None, format=None, filesystem=None,
         files may be present in the Dataset (resulting in an error at scan
         time).
     ignore_prefixes : list, optional
-        Files matching one of those prefixes will be ignored by the
+        Files matching any of these prefixes will be ignored by the
         discovery process. This is matched to the basename of a path.
         By default this is ['.', '_'].
         Note that discovery happens only if a directory is passed as source.
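
At the dataset layer, each prefix is compared against the basename of every
path encountered during discovery; a short sketch under the same hypothetical
layout:

    import pyarrow.dataset as ds

    # "data/_tmp/part-0.parquet" would be skipped because the basename
    # "_tmp" starts with "_", while "data/part-1.parquet" is kept.
    dataset = ds.dataset("data/", format="parquet",
                         ignore_prefixes=["_", "."])
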
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 59c79ac..b5be07f 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -1376,7 +1376,7 @@ class _ParquetDatasetV2:
 
     def __init__(self, path_or_paths, filesystem=None, filters=None,
                  partitioning="hive", read_dictionary=None, buffer_size=None,
-                 memory_map=False, **kwargs):
+                 memory_map=False, ignore_prefixes=None, **kwargs):
         import pyarrow.dataset as ds
         import pyarrow.fs
 
@@ -1430,7 +1430,8 @@ class _ParquetDatasetV2:
 
         self._dataset = ds.dataset(path_or_paths, filesystem=filesystem,
                                    format=parquet_format,
-                                   partitioning=partitioning)
+                                   partitioning=partitioning,
+                                   ignore_prefixes=ignore_prefixes)
 
     @property
     def schema(self):
@@ -1521,6 +1522,12 @@ use_legacy_dataset : bool, default False
     for all columns and not only the partition keys, enables
     different partitioning schemes, etc.
     Set to True to use the legacy behaviour.
+ignore_prefixes : list, optional
+    Files matching any of these prefixes will be ignored by the
+    discovery process if use_legacy_dataset=False.
+    This is matched to the basename of a path.
+    By default this is ['.', '_'].
+    Note that discovery happens only if a directory is passed as source.
 filesystem : FileSystem, default None
     If nothing passed, paths assumed to be found in the local on-disk
     filesystem.
@@ -1544,7 +1551,8 @@ Returns
 def read_table(source, columns=None, use_threads=True, metadata=None,
                use_pandas_metadata=False, memory_map=False,
                read_dictionary=None, filesystem=None, filters=None,
-               buffer_size=0, partitioning="hive", use_legacy_dataset=False):
+               buffer_size=0, partitioning="hive", use_legacy_dataset=False,
+               ignore_prefixes=None):
     if not use_legacy_dataset:
         if metadata is not None:
             raise ValueError(
@@ -1562,6 +1570,7 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
                 read_dictionary=read_dictionary,
                 buffer_size=buffer_size,
                 filters=filters,
+                ignore_prefixes=ignore_prefixes,
             )
         except ImportError:
             # fall back on ParquetFile for simple cases when pyarrow.dataset
@@ -1585,6 +1594,11 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
         return dataset.read(columns=columns, use_threads=use_threads,
                             use_pandas_metadata=use_pandas_metadata)
 
+    if ignore_prefixes is not None:
+        raise ValueError(
+            "The 'ignore_prefixes' keyword is only supported when "
+            "use_legacy_dataset=False")
+
     if _is_path_like(source):
         pf = ParquetDataset(source, metadata=metadata, memory_map=memory_map,
                             read_dictionary=read_dictionary,
@@ -1616,7 +1630,7 @@ switched to False.""",
 
 def read_pandas(source, columns=None, use_threads=True, memory_map=False,
                 metadata=None, filters=None, buffer_size=0,
-                use_legacy_dataset=True):
+                use_legacy_dataset=True, ignore_prefixes=None):
     return read_table(
         source,
         columns=columns,
@@ -1627,6 +1641,7 @@ def read_pandas(source, columns=None, use_threads=True, memory_map=False,
         buffer_size=buffer_size,
         use_pandas_metadata=True,
         use_legacy_dataset=use_legacy_dataset,
+        ignore_prefixes=ignore_prefixes
     )
 
 
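The guard added to the legacy code path above means the keyword cannot be
combined with use_legacy_dataset=True; a sketch of that failure mode, again
with a hypothetical path:

    import pyarrow.parquet as pq

    try:
        # Only the dataset-based implementation understands the keyword,
        # so requesting the legacy reader at the same time is rejected.
        pq.read_table("data/", use_legacy_dataset=True,
                      ignore_prefixes=["_staging"])
    except ValueError as exc:
        print(exc)  # The 'ignore_prefixes' keyword is only supported ...
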
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index a24b1b3..07af08f 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2678,6 +2678,30 @@ def test_ignore_no_private_directories_path_list(
     _assert_dataset_paths(dataset, paths, use_legacy_dataset)
 
 
+@pytest.mark.pandas
+@parametrize_legacy_dataset_fixed
+def test_ignore_custom_prefixes(tempdir, use_legacy_dataset):
+    # ARROW-9573 - allow override of default ignore_prefixes
+    part = ["xxx"] * 3 + ["yyy"] * 3
+    table = pa.table([
+        pa.array(range(len(part))),
+        pa.array(part).dictionary_encode(),
+    ], names=['index', '_part'])
+
+    pq.write_to_dataset(table, str(tempdir), partition_cols=['_part'])
+
+    private_duplicate = tempdir / '_private_duplicate'
+    private_duplicate.mkdir()
+    pq.write_to_dataset(table, str(private_duplicate),
+                        partition_cols=['_part'])
+
+    read = pq.read_table(
+        tempdir, use_legacy_dataset=use_legacy_dataset,
+        ignore_prefixes=['_private'])
+
+    assert read.equals(table)
+
+
 @parametrize_legacy_dataset_fixed
 def test_empty_directory(tempdir, use_legacy_dataset):
     # ARROW-5310 - reading empty directory