You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2020/08/11 15:57:46 UTC
[arrow] 17/22: ARROW-9573: [Python][Dataset] Provide read_table(ignore_prefixes=)
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch maint-1.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 51d0c450ba5a1e88365b1c58aad646ce80ce5db6
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Thu Aug 6 12:10:42 2020 -0400
ARROW-9573: [Python][Dataset] Provide read_table(ignore_prefixes=)
Closes #7900 from bkietz/9573-expose-ignore_prefixes
Authored-by: Benjamin Kietzman <be...@gmail.com>
Signed-off-by: Benjamin Kietzman <be...@gmail.com>
---
python/pyarrow/dataset.py | 2 +-
python/pyarrow/parquet.py | 23 +++++++++++++++++++----
python/pyarrow/tests/test_parquet.py | 24 ++++++++++++++++++++++++
3 files changed, 44 insertions(+), 5 deletions(-)
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index f4620d0..fd03aee 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -581,7 +581,7 @@ def dataset(source, schema=None, format=None, filesystem=None,
files may be present in the Dataset (resulting in an error at scan
time).
ignore_prefixes : list, optional
- Files matching one of those prefixes will be ignored by the
+ Files matching any of these prefixes will be ignored by the
discovery process. This is matched to the basename of a path.
By default this is ['.', '_'].
Note that discovery happens only if a directory is passed as source.
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 59c79ac..b5be07f 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -1376,7 +1376,7 @@ class _ParquetDatasetV2:
def __init__(self, path_or_paths, filesystem=None, filters=None,
partitioning="hive", read_dictionary=None, buffer_size=None,
- memory_map=False, **kwargs):
+ memory_map=False, ignore_prefixes=None, **kwargs):
import pyarrow.dataset as ds
import pyarrow.fs
@@ -1430,7 +1430,8 @@ class _ParquetDatasetV2:
self._dataset = ds.dataset(path_or_paths, filesystem=filesystem,
format=parquet_format,
- partitioning=partitioning)
+ partitioning=partitioning,
+ ignore_prefixes=ignore_prefixes)
@property
def schema(self):
@@ -1521,6 +1522,12 @@ use_legacy_dataset : bool, default False
for all columns and not only the partition keys, enables
different partitioning schemes, etc.
Set to False to use the legacy behaviour.
+ignore_prefixes : list, optional
+ Files matching any of these prefixes will be ignored by the
+ discovery process if use_legacy_dataset=False.
+ This is matched to the basename of a path.
+ By default this is ['.', '_'].
+ Note that discovery happens only if a directory is passed as source.
filesystem : FileSystem, default None
If nothing passed, paths assumed to be found in the local on-disk
filesystem.
@@ -1544,7 +1551,8 @@ Returns
def read_table(source, columns=None, use_threads=True, metadata=None,
use_pandas_metadata=False, memory_map=False,
read_dictionary=None, filesystem=None, filters=None,
- buffer_size=0, partitioning="hive", use_legacy_dataset=False):
+ buffer_size=0, partitioning="hive", use_legacy_dataset=False,
+ ignore_prefixes=None):
if not use_legacy_dataset:
if metadata is not None:
raise ValueError(
@@ -1562,6 +1570,7 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
read_dictionary=read_dictionary,
buffer_size=buffer_size,
filters=filters,
+ ignore_prefixes=ignore_prefixes,
)
except ImportError:
# fall back on ParquetFile for simple cases when pyarrow.dataset
@@ -1585,6 +1594,11 @@ def read_table(source, columns=None, use_threads=True, metadata=None,
return dataset.read(columns=columns, use_threads=use_threads,
use_pandas_metadata=use_pandas_metadata)
+ if ignore_prefixes is not None:
+ raise ValueError(
+ "The 'ignore_prefixes' keyword is only supported when "
+ "use_legacy_dataset=False")
+
if _is_path_like(source):
pf = ParquetDataset(source, metadata=metadata, memory_map=memory_map,
read_dictionary=read_dictionary,
@@ -1616,7 +1630,7 @@ switched to False.""",
def read_pandas(source, columns=None, use_threads=True, memory_map=False,
metadata=None, filters=None, buffer_size=0,
- use_legacy_dataset=True):
+ use_legacy_dataset=True, ignore_prefixes=None):
return read_table(
source,
columns=columns,
@@ -1627,6 +1641,7 @@ def read_pandas(source, columns=None, use_threads=True, memory_map=False,
buffer_size=buffer_size,
use_pandas_metadata=True,
use_legacy_dataset=use_legacy_dataset,
+ ignore_prefixes=ignore_prefixes
)
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index a24b1b3..07af08f 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2678,6 +2678,30 @@ def test_ignore_no_private_directories_path_list(
_assert_dataset_paths(dataset, paths, use_legacy_dataset)
+@pytest.mark.pandas
+@parametrize_legacy_dataset_fixed
+def test_ignore_custom_prefixes(tempdir, use_legacy_dataset):
+ # ARROW-9573 - allow override of default ignore_prefixes
+ part = ["xxx"] * 3 + ["yyy"] * 3
+ table = pa.table([
+ pa.array(range(len(part))),
+ pa.array(part).dictionary_encode(),
+ ], names=['index', '_part'])
+
+ pq.write_to_dataset(table, str(tempdir), partition_cols=['_part'])
+
+ private_duplicate = tempdir / '_private_duplicate'
+ private_duplicate.mkdir()
+ pq.write_to_dataset(table, str(private_duplicate),
+ partition_cols=['_part'])
+
+ read = pq.read_table(
+ tempdir, use_legacy_dataset=use_legacy_dataset,
+ ignore_prefixes=['_private'])
+
+ assert read.equals(table)
+
+
@parametrize_legacy_dataset_fixed
def test_empty_directory(tempdir, use_legacy_dataset):
# ARROW-5310 - reading empty directory