You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/28 15:14:41 UTC
arrow git commit: ARROW-1273: [Python] Add Parquet read_metadata, read_schema convenience functions
Repository: arrow
Updated Branches:
refs/heads/master 4df2a0bfa -> 44855bb16
ARROW-1273: [Python] Add Parquet read_metadata, read_schema convenience functions
cc @xhochy @cpcloud for feedback on API
Author: Wes McKinney <we...@twosigma.com>
Closes #904 from wesm/ARROW-1273 and squashes the following commits:
13725654 [Wes McKinney] Add Parquet read_metadata, read_schema convenience functions
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/44855bb1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/44855bb1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/44855bb1
Branch: refs/heads/master
Commit: 44855bb16312031a6d4285632d0071c676ef38aa
Parents: 4df2a0b
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Jul 28 11:14:35 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jul 28 11:14:35 2017 -0400
----------------------------------------------------------------------
python/doc/source/api.rst | 3 +++
python/pyarrow/parquet.py | 30 ++++++++++++++++++++++++++++++
python/pyarrow/tests/test_parquet.py | 31 ++++++++++++++++++++++++-------
3 files changed, 57 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/44855bb1/python/doc/source/api.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index 6554465..b84163b 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -239,5 +239,8 @@ Apache Parquet
ParquetDataset
ParquetFile
read_table
+ read_metadata
+ read_pandas
+ read_schema
write_metadata
write_table
http://git-wip-us.apache.org/repos/asf/arrow/blob/44855bb1/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 34c1d12..a3af9ae 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -805,3 +805,33 @@ def write_metadata(schema, where, version='1.0',
)
writer = ParquetWriter(where, schema, **options)
writer.close()
+
+
+def read_metadata(where):
+ """
+ Read FileMetadata from footer of a single Parquet file
+
+ Parameters
+ ----------
+ where : string (filepath) or file-like object
+
+ Returns
+ -------
+ metadata : FileMetadata
+ """
+ return ParquetFile(where).metadata
+
+
+def read_schema(where):
+ """
+ Read effective Arrow schema from Parquet file metadata
+
+ Parameters
+ ----------
+ where : string (filepath) or file-like object
+
+ Returns
+ -------
+ schema : pyarrow.Schema
+ """
+ return ParquetFile(where).schema.to_arrow_schema()
http://git-wip-us.apache.org/repos/asf/arrow/blob/44855bb1/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 7443df8..f840673 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -124,9 +124,8 @@ def test_pandas_parquet_custom_metadata(tmpdir):
assert b'pandas' in arrow_table.schema.metadata
_write_table(arrow_table, filename.strpath, version="2.0")
- pf = pq.ParquetFile(filename.strpath)
- md = pf.metadata.metadata
+ md = pq.read_metadata(filename.strpath).metadata
assert b'pandas' in md
js = json.loads(md[b'pandas'].decode('utf8'))
@@ -592,7 +591,7 @@ def test_pass_separate_metadata():
_write_table(a_table, buf, compression='snappy', version='2.0')
buf.seek(0)
- metadata = pq.ParquetFile(buf).metadata
+ metadata = pq.read_metadata(buf)
buf.seek(0)
@@ -788,14 +787,32 @@ def test_read_common_metadata_files(tmpdir):
dataset = pq.ParquetDataset(base_path)
assert dataset.metadata_path == metadata_path
- pf = pq.ParquetFile(data_path)
- assert dataset.schema.equals(pf.schema)
+ common_schema = pq.read_metadata(data_path).schema
+ assert dataset.schema.equals(common_schema)
# handle list of one directory
dataset2 = pq.ParquetDataset([base_path])
assert dataset2.schema.equals(dataset.schema)
+@parquet
+def test_read_schema(tmpdir):
+ import pyarrow.parquet as pq
+
+ N = 100
+ df = pd.DataFrame({
+ 'index': np.arange(N),
+ 'values': np.random.randn(N)
+ }, columns=['index', 'values'])
+
+ data_path = pjoin(str(tmpdir), 'test.parquet')
+
+ table = pa.Table.from_pandas(df)
+ _write_table(table, data_path)
+
+ assert table.schema.equals(pq.read_schema(data_path))
+
+
def _filter_partition(df, part_keys):
predicate = np.ones(len(df), dtype=bool)
@@ -847,7 +864,7 @@ def test_read_multiple_files(tmpdir):
assert result.equals(expected)
# Read with provided metadata
- metadata = pq.ParquetFile(paths[0]).metadata
+ metadata = pq.read_metadata(paths[0])
result2 = read_multiple_files(paths, metadata=metadata)
assert result2.equals(expected)
@@ -873,7 +890,7 @@ def test_read_multiple_files(tmpdir):
t = pa.Table.from_pandas(bad_apple)
_write_table(t, bad_apple_path)
- bad_meta = pq.ParquetFile(bad_apple_path).metadata
+ bad_meta = pq.read_metadata(bad_apple_path)
with pytest.raises(ValueError):
read_multiple_files(paths + [bad_apple_path])