You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/18 00:34:56 UTC
arrow git commit: ARROW-1079: [Python] Filter out private directories when building Parquet dataset manifest
Repository: arrow
Updated Branches:
refs/heads/master b4d34f8fd -> a1c8b83b4
ARROW-1079: [Python] Filter out private directories when building Parquet dataset manifest
Some systems like Hive and Impala use special files or directories to signal to other readers that a dataset modification is in progress. If such directories (starting with an underscore) exist in a flat Parquet directory, their presence currently breaks the dataset reader.
Author: Wes McKinney <we...@twosigma.com>
Closes #860 from wesm/ARROW-1079 and squashes the following commits:
c1c445b4 [Wes McKinney] Filter out private directories when building Parquet dataset manifest
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/a1c8b83b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/a1c8b83b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/a1c8b83b
Branch: refs/heads/master
Commit: a1c8b83b49192230bd2c91bd009e2ff272d89310
Parents: b4d34f8
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Jul 17 20:34:51 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jul 17 20:34:51 2017 -0400
----------------------------------------------------------------------
python/pyarrow/parquet.py | 9 ++++++++
python/pyarrow/tests/test_parquet.py | 35 +++++++++++++++++++++++++++++--
2 files changed, 42 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/a1c8b83b/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index dc26dab..aa2352c 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import os
import json
import six
@@ -414,6 +415,9 @@ class ParquetManifest(object):
elif fs.isdir(path):
directories.append(path)
+ # ARROW-1079: Filter out "private" directories starting with underscore
+ directories = [x for x in directories if not _is_private_directory(x)]
+
if len(files) > 0 and len(directories) > 0:
raise ValueError('Found files in an intermediate '
'directory: {0}'.format(base_path))
@@ -456,6 +460,11 @@ def _parse_hive_partition(value):
return value.split('=', 1)
+def _is_private_directory(x):
+ _, tail = os.path.split(x)
+ return tail.startswith('_') and '=' not in tail
+
+
def _path_split(path, sep):
i = path.rfind(sep) + 1
head, tail = path[:i], path[i:]
http://git-wip-us.apache.org/repos/asf/arrow/blob/a1c8b83b/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index f606a7f..0f44d16 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -36,9 +36,14 @@ import pandas.util.testing as tm
parquet = pytest.mark.parquet
-def _write_table(*args, **kwargs):
+def _write_table(table, path, **kwargs):
import pyarrow.parquet as pq
- return pq.write_table(*args, **kwargs)
+
+ if isinstance(table, pd.DataFrame):
+ table = pa.Table.from_pandas(table)
+
+ pq.write_table(table, path, **kwargs)
+ return table
def _read_table(*args, **kwargs):
@@ -852,6 +857,32 @@ def test_read_multiple_files(tmpdir):
@parquet
+def test_ignore_private_directories(tmpdir):
+ import pyarrow.parquet as pq
+
+ nfiles = 10
+ size = 5
+
+ dirpath = tmpdir.join(guid()).strpath
+ os.mkdir(dirpath)
+
+ test_data = []
+ paths = []
+ for i in range(nfiles):
+ df = _test_dataframe(size, seed=i)
+ path = pjoin(dirpath, '{0}.parquet'.format(i))
+
+ test_data.append(_write_table(df, path))
+ paths.append(path)
+
+ # private directory
+ os.mkdir(pjoin(dirpath, '_impala_staging'))
+
+ dataset = pq.ParquetDataset(dirpath)
+ assert set(paths) == set(x.path for x in dataset.pieces)
+
+
+@parquet
def test_multiindex_duplicate_values(tmpdir):
num_rows = 3
numbers = list(range(num_rows))