You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/29 01:07:14 UTC
[arrow] branch master updated: ARROW-1684: [Python] Support
selecting nested Parquet fields by any path prefix
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new bc70994 ARROW-1684: [Python] Support selecting nested Parquet fields by any path prefix
bc70994 is described below
commit bc7099408b6dc6288510cf58b83b29ace0ca3c6d
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Tue Nov 28 20:07:11 2017 -0500
ARROW-1684: [Python] Support selecting nested Parquet fields by any path prefix
Author: Wes McKinney <we...@twosigma.com>
Closes #1366 from wesm/ARROW-1684 and squashes the following commits:
e63e42aa [Wes McKinney] Support selecting nested Parquet fields by any path prefix
---
python/pyarrow/_parquet.pxd | 1 +
python/pyarrow/_parquet.pyx | 29 ++++++++++++++++++++-----
python/pyarrow/parquet.py | 41 +++++++++++++++++++++++++++++++-----
python/pyarrow/tests/test_parquet.py | 22 +++++++++++++++++++
4 files changed, 83 insertions(+), 10 deletions(-)
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 7e5e575..55b66b5 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -37,6 +37,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
cdef cppclass ColumnPath:
c_string ToDotString()
+ vector[c_string] ToDotVector()
cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index eca6b20..147af21 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -600,9 +600,11 @@ cdef class ParquetReader:
object source
CMemoryPool* allocator
unique_ptr[FileReader] reader
- column_idx_map
FileMetaData _metadata
+ cdef public:
+ _column_idx_map
+
def __cinit__(self, MemoryPool memory_pool=None):
self.allocator = maybe_unbox_memory_pool(memory_pool)
self._metadata = None
@@ -624,6 +626,23 @@ cdef class ParquetReader:
check_status(OpenFile(rd_handle, self.allocator, properties,
c_metadata, &self.reader))
+ property column_paths:
+
+ def __get__(self):
+ cdef:
+ FileMetaData container = self.metadata
+ const CFileMetaData* metadata = container._metadata
+ vector[c_string] path
+ int i = 0
+
+ paths = []
+ for i in range(0, metadata.num_columns()):
+ path = (metadata.schema().Column(i)
+ .path().get().ToDotVector())
+ paths.append([frombytes(x) for x in path])
+
+ return paths
+
@property
def metadata(self):
cdef:
@@ -729,14 +748,14 @@ cdef class ParquetReader:
const CFileMetaData* metadata = container._metadata
int i = 0
- if self.column_idx_map is None:
- self.column_idx_map = {}
+ if self._column_idx_map is None:
+ self._column_idx_map = {}
for i in range(0, metadata.num_columns()):
col_bytes = tobytes(metadata.schema().Column(i)
.path().get().ToDotString())
- self.column_idx_map[col_bytes] = i
+ self._column_idx_map[col_bytes] = i
- return self.column_idx_map[tobytes(column_name)]
+ return self._column_idx_map[tobytes(column_name)]
def read_column(self, int column_index):
cdef:
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 37da662..9fb890c 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+from collections import defaultdict
import os
import inspect
import json
@@ -54,6 +55,24 @@ class ParquetFile(object):
self.reader = ParquetReader()
self.reader.open(source, metadata=metadata)
self.common_metadata = common_metadata
+ self._nested_paths_by_prefix = self._build_nested_paths()
+
+ def _build_nested_paths(self):
+ paths = self.reader.column_paths
+
+ result = defaultdict(list)
+
+ def _visit_piece(i, key, rest):
+ result[key].append(i)
+
+ if len(rest) > 0:
+ nested_key = '.'.join((key, rest[0]))
+ _visit_piece(i, nested_key, rest[1:])
+
+ for i, path in enumerate(paths):
+ _visit_piece(i, path[0], path[1:])
+
+ return result
@property
def metadata(self):
@@ -75,7 +94,9 @@ class ParquetFile(object):
Parameters
----------
columns: list
- If not None, only these columns will be read from the row group.
+ If not None, only these columns will be read from the row group. A
+ column name may be a prefix of a nested field, e.g. 'a' will select
+ 'a.b', 'a.c', and 'a.d.e'
nthreads : int, default 1
Number of columns to read in parallel. If > 1, requires that the
underlying file source is threadsafe
@@ -100,7 +121,9 @@ class ParquetFile(object):
Parameters
----------
columns: list
- If not None, only these columns will be read from the file.
+ If not None, only these columns will be read from the file. A
+ column name may be a prefix of a nested field, e.g. 'a' will select
+ 'a.b', 'a.c', and 'a.d.e'
nthreads : int, default 1
Number of columns to read in parallel. If > 1, requires that the
underlying file source is threadsafe
@@ -143,7 +166,11 @@ class ParquetFile(object):
if column_names is None:
return None
- indices = list(map(self.reader.column_name_idx, column_names))
+ indices = []
+
+ for name in column_names:
+ if name in self._nested_paths_by_prefix:
+ indices.extend(self._nested_paths_by_prefix[name])
if use_pandas_metadata:
file_keyvalues = self.metadata.metadata
@@ -837,7 +864,9 @@ def read_table(source, columns=None, nthreads=1, metadata=None,
name or directory name. For passing Python file objects or byte
buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
columns: list
- If not None, only these columns will be read from the file.
+ If not None, only these columns will be read from the file. A column
+ name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
+ 'a.c', and 'a.d.e'
nthreads : int, default 1
Number of columns to read in parallel. Requires that the underlying
file source is threadsafe
@@ -875,7 +904,9 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None):
name. For passing Python file objects or byte buffers,
see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
columns: list
- If not None, only these columns will be read from the file.
+ If not None, only these columns will be read from the file. A column
+ name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
+ 'a.c', and 'a.d.e'
nthreads : int, default 1
Number of columns to read in parallel. Requires that the underlying
file source is threadsafe
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 274ff45..9004fc0 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1469,6 +1469,28 @@ def test_index_column_name_duplicate(tmpdir):
@parquet
+def test_parquet_nested_convenience(tmpdir):
+ # ARROW-1684
+ import pyarrow.parquet as pq
+
+ df = pd.DataFrame({
+ 'a': [[1, 2, 3], None, [4, 5], []],
+ 'b': [[1.], None, None, [6., 7.]],
+ })
+
+ path = str(tmpdir / 'nested_convenience.parquet')
+
+ table = pa.Table.from_pandas(df, preserve_index=False)
+ _write_table(table, path)
+
+ read = pq.read_table(path, columns=['a'])
+ tm.assert_frame_equal(read.to_pandas(), df[['a']])
+
+ read = pq.read_table(path, columns=['a', 'b'])
+ tm.assert_frame_equal(read.to_pandas(), df)
+
+
+@parquet
def test_backwards_compatible_index_naming():
expected_string = b"""\
carat cut color clarity depth table price x y z
--
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].