Posted to commits@arrow.apache.org by uw...@apache.org on 2017/01/10 07:44:59 UTC

arrow git commit: ARROW-442: [Python] Inspect Parquet file metadata from Python

Repository: arrow
Updated Branches:
  refs/heads/master 3195948f6 -> f44b6a3b9


ARROW-442: [Python] Inspect Parquet file metadata from Python

I also made the Cython parquet extension "private" so that higher-level logic (e.g., the upcoming handling of multiple files) can be implemented in pure Python, which does not need to be compiled.

Requires PARQUET-828 for the test suite to pass.
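
For orientation, a minimal sketch of how the new metadata API is meant to be
used (the file name 'example.parquet' is a placeholder, not part of this
patch):

    import pyarrow.parquet as pq

    pf = pq.ParquetFile('example.parquet')

    meta = pf.metadata             # FileMetaData: file-level statistics
    print(meta.num_rows, meta.num_row_groups, meta.format_version)

    schema = pf.schema             # per-column physical and logical types
    print(schema[0].physical_type, schema[0].logical_type)

    rg = meta.row_group(0)         # RowGroupMetaData for the first row group
    print(rg.num_rows, rg.total_byte_size)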

Author: Wes McKinney <we...@twosigma.com>

Closes #275 from wesm/ARROW-442 and squashes the following commits:

a4255a2 [Wes McKinney] Add row group metadata accessor, add smoke tests
75a11cf [Wes McKinney] Add more metadata accessor scaffolding, to be tested
e59ca40 [Wes McKinney] Move parquet Cython wrapper to a private import, add parquet.py for high level logic


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f44b6a3b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f44b6a3b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f44b6a3b

Branch: refs/heads/master
Commit: f44b6a3b91a15461804dd7877840a557caa52e4e
Parents: 3195948
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jan 10 08:44:01 2017 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Tue Jan 10 08:44:01 2017 +0100

----------------------------------------------------------------------
 python/CMakeLists.txt                |   2 +-
 python/pyarrow/_parquet.pxd          | 217 +++++++++++++
 python/pyarrow/_parquet.pyx          | 520 ++++++++++++++++++++++++++++++
 python/pyarrow/includes/parquet.pxd  | 147 ---------
 python/pyarrow/parquet.py            | 116 +++++++
 python/pyarrow/parquet.pyx           | 244 --------------
 python/pyarrow/tests/test_parquet.py |  71 +++-
 python/setup.py                      |   4 +-
 8 files changed, 922 insertions(+), 399 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index e42c45d..45115d4 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -428,7 +428,7 @@ if (PYARROW_BUILD_PARQUET)
     parquet_arrow)
   set(CYTHON_EXTENSIONS
     ${CYTHON_EXTENSIONS}
-    parquet)
+    _parquet)
 endif()
 
 add_library(pyarrow SHARED

http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/pyarrow/_parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
new file mode 100644
index 0000000..faca845
--- /dev/null
+++ b/python/pyarrow/_parquet.pxd
@@ -0,0 +1,217 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus,
+                                        CTable, MemoryPool)
+from pyarrow.includes.libarrow_io cimport ReadableFileInterface, OutputStream
+
+
+cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
+    cdef cppclass Node:
+        pass
+
+    cdef cppclass GroupNode(Node):
+        pass
+
+    cdef cppclass PrimitiveNode(Node):
+        pass
+
+    cdef cppclass ColumnPath:
+        c_string ToDotString()
+
+cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
+    enum ParquetType" parquet::Type::type":
+        ParquetType_BOOLEAN" parquet::Type::BOOLEAN"
+        ParquetType_INT32" parquet::Type::INT32"
+        ParquetType_INT64" parquet::Type::INT64"
+        ParquetType_INT96" parquet::Type::INT96"
+        ParquetType_FLOAT" parquet::Type::FLOAT"
+        ParquetType_DOUBLE" parquet::Type::DOUBLE"
+        ParquetType_BYTE_ARRAY" parquet::Type::BYTE_ARRAY"
+        ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"
+
+    enum ParquetLogicalType" parquet::LogicalType::type":
+        ParquetLogicalType_NONE" parquet::LogicalType::NONE"
+        ParquetLogicalType_UTF8" parquet::LogicalType::UTF8"
+        ParquetLogicalType_MAP" parquet::LogicalType::MAP"
+        ParquetLogicalType_MAP_KEY_VALUE" parquet::LogicalType::MAP_KEY_VALUE"
+        ParquetLogicalType_LIST" parquet::LogicalType::LIST"
+        ParquetLogicalType_ENUM" parquet::LogicalType::ENUM"
+        ParquetLogicalType_DECIMAL" parquet::LogicalType::DECIMAL"
+        ParquetLogicalType_DATE" parquet::LogicalType::DATE"
+        ParquetLogicalType_TIME_MILLIS" parquet::LogicalType::TIME_MILLIS"
+        ParquetLogicalType_TIME_MICROS" parquet::LogicalType::TIME_MICROS"
+        ParquetLogicalType_TIMESTAMP_MILLIS" parquet::LogicalType::TIMESTAMP_MILLIS"
+        ParquetLogicalType_TIMESTAMP_MICROS" parquet::LogicalType::TIMESTAMP_MICROS"
+        ParquetLogicalType_UINT_8" parquet::LogicalType::UINT_8"
+        ParquetLogicalType_UINT_16" parquet::LogicalType::UINT_16"
+        ParquetLogicalType_UINT_32" parquet::LogicalType::UINT_32"
+        ParquetLogicalType_UINT_64" parquet::LogicalType::UINT_64"
+        ParquetLogicalType_INT_8" parquet::LogicalType::INT_8"
+        ParquetLogicalType_INT_16" parquet::LogicalType::INT_16"
+        ParquetLogicalType_INT_32" parquet::LogicalType::INT_32"
+        ParquetLogicalType_INT_64" parquet::LogicalType::INT_64"
+        ParquetLogicalType_JSON" parquet::LogicalType::JSON"
+        ParquetLogicalType_BSON" parquet::LogicalType::BSON"
+        ParquetLogicalType_INTERVAL" parquet::LogicalType::INTERVAL"
+
+    enum ParquetRepetition" parquet::Repetition::type":
+        ParquetRepetition_REQUIRED" parquet::Repetition::REQUIRED"
+        ParquetRepetition_OPTIONAL" parquet::Repetition::OPTIONAL"
+        ParquetRepetition_REPEATED" parquet::Repetition::REPEATED"
+
+    enum ParquetEncoding" parquet::Encoding::type":
+        ParquetEncoding_PLAIN" parquet::Encoding::PLAIN"
+        ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY"
+        ParquetEncoding_RLE" parquet::Encoding::RLE"
+        ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED"
+        ParquetEncoding_DELTA_BINARY_PACKED" parquet::Encoding::DELTA_BINARY_PACKED"
+        ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY" parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY"
+        ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY"
+        ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY"
+
+    enum ParquetCompression" parquet::Compression::type":
+        ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED"
+        ParquetCompression_SNAPPY" parquet::Compression::SNAPPY"
+        ParquetCompression_GZIP" parquet::Compression::GZIP"
+        ParquetCompression_LZO" parquet::Compression::LZO"
+        ParquetCompression_BROTLI" parquet::Compression::BROTLI"
+
+    enum ParquetVersion" parquet::ParquetVersion::type":
+        ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
+        ParquetVersion_V2" parquet::ParquetVersion::PARQUET_2_0"
+
+    cdef cppclass ColumnDescriptor:
+        shared_ptr[ColumnPath] path()
+
+        int16_t max_definition_level()
+        int16_t max_repetition_level()
+
+        ParquetType physical_type()
+        ParquetLogicalType logical_type()
+        const c_string& name()
+        int type_length()
+        int type_precision()
+        int type_scale()
+
+    cdef cppclass SchemaDescriptor:
+        const ColumnDescriptor* Column(int i)
+        shared_ptr[Node] schema()
+        GroupNode* group()
+        int num_columns()
+
+
+cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
+    cdef cppclass ColumnReader:
+        pass
+
+    cdef cppclass BoolReader(ColumnReader):
+        pass
+
+    cdef cppclass Int32Reader(ColumnReader):
+        pass
+
+    cdef cppclass Int64Reader(ColumnReader):
+        pass
+
+    cdef cppclass Int96Reader(ColumnReader):
+        pass
+
+    cdef cppclass FloatReader(ColumnReader):
+        pass
+
+    cdef cppclass DoubleReader(ColumnReader):
+        pass
+
+    cdef cppclass ByteArrayReader(ColumnReader):
+        pass
+
+    cdef cppclass RowGroupReader:
+        pass
+
+    cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData":
+        int num_columns()
+        int64_t num_rows()
+        int64_t total_byte_size()
+
+    cdef cppclass CFileMetaData" parquet::FileMetaData":
+        uint32_t size()
+        int num_columns()
+        int64_t num_rows()
+        int num_row_groups()
+        int32_t version()
+        const c_string created_by()
+        int num_schema_elements()
+
+        unique_ptr[CRowGroupMetaData] RowGroup(int i)
+        const SchemaDescriptor* schema()
+
+    cdef cppclass ParquetFileReader:
+        # TODO: Some default arguments are missing
+        @staticmethod
+        unique_ptr[ParquetFileReader] OpenFile(const c_string& path)
+        shared_ptr[CFileMetaData] metadata()
+
+
+cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
+    cdef cppclass ParquetOutputStream" parquet::OutputStream":
+        pass
+
+    cdef cppclass LocalFileOutputStream(ParquetOutputStream):
+        LocalFileOutputStream(const c_string& path)
+        void Close()
+
+    cdef cppclass WriterProperties:
+        cppclass Builder:
+            Builder* version(ParquetVersion version)
+            Builder* compression(ParquetCompression codec)
+            Builder* compression(const c_string& path,
+                                 ParquetCompression codec)
+            Builder* disable_dictionary()
+            Builder* enable_dictionary()
+            Builder* enable_dictionary(const c_string& path)
+            shared_ptr[WriterProperties] build()
+
+
+cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
+    CStatus OpenFile(const shared_ptr[ReadableFileInterface]& file,
+                     MemoryPool* allocator,
+                     unique_ptr[FileReader]* reader)
+
+    cdef cppclass FileReader:
+        FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader)
+        CStatus ReadFlatColumn(int i, shared_ptr[CArray]* out)
+        CStatus ReadFlatTable(shared_ptr[CTable]* out)
+        const ParquetFileReader* parquet_reader()
+
+
+cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
+    CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema,
+                              shared_ptr[CSchema]* out)
+    CStatus ToParquetSchema(const CSchema* arrow_schema,
+                            shared_ptr[SchemaDescriptor]* out)
+
+
+cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
+    cdef CStatus WriteFlatTable(
+        const CTable* table, MemoryPool* pool,
+        const shared_ptr[OutputStream]& sink,
+        int64_t chunk_size,
+        const shared_ptr[WriterProperties]& properties)

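The ColumnDescriptor accessors declared above surface in Python as plain
string and integer properties. A hedged sketch, reusing the `pf` handle from
the first example (the column layout is hypothetical):

    col = pf.schema[0]
    col.name                     # leaf name of the column
    col.path                     # dotted path, e.g. 'a.b.c' for nested columns
    col.max_definition_level     # ColumnDescriptor::max_definition_level
    col.max_repetition_level
    col.physical_type            # e.g. 'INT64' or 'BYTE_ARRAY'
    col.logical_type             # e.g. 'UTF8', 'DECIMAL', or 'NONE'
    col.length                   # FIXED_LEN_BYTE_ARRAY only
    col.precision, col.scale     # DECIMAL only
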
http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
new file mode 100644
index 0000000..c0dc3eb
--- /dev/null
+++ b/python/pyarrow/_parquet.pyx
@@ -0,0 +1,520 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from pyarrow._parquet cimport *
+
+from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.libarrow_io cimport (ReadableFileInterface, OutputStream,
+                                           FileOutputStream)
+cimport pyarrow.includes.pyarrow as pyarrow
+
+from pyarrow.array cimport Array
+from pyarrow.compat import tobytes, frombytes
+from pyarrow.error import ArrowException
+from pyarrow.error cimport check_status
+from pyarrow.io import NativeFile
+from pyarrow.table cimport Table
+
+from pyarrow.io cimport NativeFile, get_reader, get_writer
+
+import six
+
+
+cdef class RowGroupMetaData:
+    cdef:
+        unique_ptr[CRowGroupMetaData] up_metadata
+        CRowGroupMetaData* metadata
+        object parent
+
+    def __cinit__(self):
+        pass
+
+    cdef init_from_file(self, FileMetaData parent, int i):
+        if i < 0 or i >= parent.num_row_groups:
+            raise IndexError('{0} out of bounds'.format(i))
+        self.up_metadata = parent.metadata.RowGroup(i)
+        self.metadata = self.up_metadata.get()
+        self.parent = parent
+
+    def __repr__(self):
+        return """{0}
+  num_columns: {1}
+  num_rows: {2}
+  total_byte_size: {3}""".format(object.__repr__(self),
+                                 self.num_columns,
+                                 self.num_rows,
+                                 self.total_byte_size)
+
+    property num_columns:
+
+        def __get__(self):
+            return self.metadata.num_columns()
+
+    property num_rows:
+
+        def __get__(self):
+            return self.metadata.num_rows()
+
+    property total_byte_size:
+
+        def __get__(self):
+            return self.metadata.total_byte_size()
+
+
+cdef class FileMetaData:
+    cdef:
+        shared_ptr[CFileMetaData] sp_metadata
+        CFileMetaData* metadata
+        object _schema
+
+    def __cinit__(self):
+        pass
+
+    cdef init(self, const shared_ptr[CFileMetaData]& metadata):
+        self.sp_metadata = metadata
+        self.metadata = metadata.get()
+
+    def __repr__(self):
+        return """{0}
+  created_by: {1}
+  num_columns: {2}
+  num_rows: {3}
+  num_row_groups: {4}
+  format_version: {5}
+  serialized_size: {6}""".format(object.__repr__(self),
+                                 self.created_by, self.num_columns,
+                                 self.num_rows, self.num_row_groups,
+                                 self.format_version,
+                                 self.serialized_size)
+
+    @property
+    def schema(self):
+        if self._schema is not None:
+            return self._schema
+
+        cdef Schema schema = Schema()
+        schema.init_from_filemeta(self)
+        self._schema = schema
+        return schema
+
+    property serialized_size:
+
+        def __get__(self):
+            return self.metadata.size()
+
+    property num_columns:
+
+        def __get__(self):
+            return self.metadata.num_columns()
+
+    property num_rows:
+
+        def __get__(self):
+            return self.metadata.num_rows()
+
+    property num_row_groups:
+
+        def __get__(self):
+            return self.metadata.num_row_groups()
+
+    property format_version:
+
+        def __get__(self):
+            cdef int version = self.metadata.version()
+            if version == 2:
+                return '2.0'
+            elif version == 1:
+                return '1.0'
+            else:
+                print('Unrecognized file version, assuming 1.0: {0}'
+                      .format(version))
+                return '1.0'
+
+    property created_by:
+
+        def __get__(self):
+            return frombytes(self.metadata.created_by())
+
+    def row_group(self, int i):
+        """
+        Return the metadata for row group i as a RowGroupMetaData instance.
+        """
+        cdef RowGroupMetaData result = RowGroupMetaData()
+        result.init_from_file(self, i)
+        return result
+
+
+cdef class Schema:
+    cdef:
+        object parent  # the FileMetaData owning the SchemaDescriptor
+        const SchemaDescriptor* schema
+
+    def __cinit__(self):
+        self.parent = None
+        self.schema = NULL
+
+    def __repr__(self):
+        cdef const ColumnDescriptor* descr
+        elements = []
+        for i in range(self.schema.num_columns()):
+            col = self.column(i)
+            logical_type = col.logical_type
+            formatted = '{0}: {1}'.format(col.path, col.physical_type)
+            if logical_type != 'NONE':
+                formatted += ' {0}'.format(logical_type)
+            elements.append(formatted)
+
+        return """{0}
+{1}
+ """.format(object.__repr__(self), '\n'.join(elements))
+
+    cdef init_from_filemeta(self, FileMetaData container):
+        self.parent = container
+        self.schema = container.metadata.schema()
+
+    def __len__(self):
+        return self.schema.num_columns()
+
+    def __getitem__(self, i):
+        return self.column(i)
+
+    def column(self, i):
+        if i < 0 or i >= len(self):
+            raise IndexError('{0} out of bounds'.format(i))
+
+        cdef ColumnSchema col = ColumnSchema()
+        col.init_from_schema(self, i)
+        return col
+
+
+cdef class ColumnSchema:
+    cdef:
+        object parent
+        const ColumnDescriptor* descr
+
+    def __cinit__(self):
+        self.descr = NULL
+
+    cdef init_from_schema(self, Schema schema, int i):
+        self.parent = schema
+        self.descr = schema.schema.Column(i)
+
+    def __repr__(self):
+        physical_type = self.physical_type
+        logical_type = self.logical_type
+        if logical_type == 'DECIMAL':
+            logical_type = 'DECIMAL({0}, {1})'.format(self.precision,
+                                                      self.scale)
+        elif physical_type == 'FIXED_LEN_BYTE_ARRAY':
+            logical_type = ('FIXED_LEN_BYTE_ARRAY(length={0})'
+                            .format(self.length))
+
+        return """<ParquetColumnSchema>
+  name: {0}
+  path: {1}
+  max_definition_level: {2}
+  max_repetition_level: {3}
+  physical_type: {4}
+  logical_type: {5}""".format(self.name, self.path, self.max_definition_level,
+                       self.max_repetition_level, physical_type,
+                       logical_type)
+
+    property name:
+
+        def __get__(self):
+            return frombytes(self.descr.name())
+
+    property path:
+
+        def __get__(self):
+            return frombytes(self.descr.path().get().ToDotString())
+
+    property max_definition_level:
+
+        def __get__(self):
+            return self.descr.max_definition_level()
+
+    property max_repetition_level:
+
+        def __get__(self):
+            return self.descr.max_repetition_level()
+
+    property physical_type:
+
+        def __get__(self):
+            return physical_type_name_from_enum(self.descr.physical_type())
+
+    property logical_type:
+
+        def __get__(self):
+            return logical_type_name_from_enum(self.descr.logical_type())
+
+    # FIXED_LEN_BYTE_ARRAY attribute
+    property length:
+
+        def __get__(self):
+            return self.descr.type_length()
+
+    # Decimal attributes
+    property precision:
+
+        def __get__(self):
+            return self.descr.type_precision()
+
+    property scale:
+
+        def __get__(self):
+            return self.descr.type_scale()
+
+
+cdef physical_type_name_from_enum(ParquetType type_):
+    return {
+        ParquetType_BOOLEAN: 'BOOLEAN',
+        ParquetType_INT32: 'INT32',
+        ParquetType_INT64: 'INT64',
+        ParquetType_INT96: 'INT96',
+        ParquetType_FLOAT: 'FLOAT',
+        ParquetType_DOUBLE: 'DOUBLE',
+        ParquetType_BYTE_ARRAY: 'BYTE_ARRAY',
+        ParquetType_FIXED_LEN_BYTE_ARRAY: 'FIXED_LEN_BYTE_ARRAY',
+    }.get(type_, 'UNKNOWN')
+
+
+cdef logical_type_name_from_enum(ParquetLogicalType type_):
+    return {
+        ParquetLogicalType_NONE: 'NONE',
+        ParquetLogicalType_UTF8: 'UTF8',
+        ParquetLogicalType_MAP: 'MAP',
+        ParquetLogicalType_MAP_KEY_VALUE: 'MAP_KEY_VALUE',
+        ParquetLogicalType_LIST: 'LIST',
+        ParquetLogicalType_ENUM: 'ENUM',
+        ParquetLogicalType_DECIMAL: 'DECIMAL',
+        ParquetLogicalType_DATE: 'DATE',
+        ParquetLogicalType_TIME_MILLIS: 'TIME_MILLIS',
+        ParquetLogicalType_TIME_MICROS: 'TIME_MICROS',
+        ParquetLogicalType_TIMESTAMP_MILLIS: 'TIMESTAMP_MILLIS',
+        ParquetLogicalType_TIMESTAMP_MICROS: 'TIMESTAMP_MICROS',
+        ParquetLogicalType_UINT_8: 'UINT_8',
+        ParquetLogicalType_UINT_16: 'UINT_16',
+        ParquetLogicalType_UINT_32: 'UINT_32',
+        ParquetLogicalType_UINT_64: 'UINT_64',
+        ParquetLogicalType_INT_8: 'INT_8',
+        ParquetLogicalType_INT_16: 'INT_16',
+        ParquetLogicalType_INT_32: 'INT_32',
+        ParquetLogicalType_INT_64: 'INT_64',
+        ParquetLogicalType_JSON: 'JSON',
+        ParquetLogicalType_BSON: 'BSON',
+        ParquetLogicalType_INTERVAL: 'INTERVAL',
+    }.get(type_, 'UNKNOWN')
+
+
+cdef class ParquetReader:
+    cdef:
+        MemoryPool* allocator
+        unique_ptr[FileReader] reader
+        column_idx_map
+        FileMetaData _metadata
+
+    def __cinit__(self):
+        self.allocator = default_memory_pool()
+        self._metadata = None
+
+    def open(self, object source):
+        cdef:
+            shared_ptr[ReadableFileInterface] rd_handle
+            c_string path
+
+        if isinstance(source, six.string_types):
+            path = tobytes(source)
+
+            # Must be in one expression to avoid calling std::move which is not
+            # possible in Cython (due to missing rvalue support)
+
+            # TODO(wesm): ParquetFileReader::OpenFile can throw?
+            self.reader = unique_ptr[FileReader](
+                new FileReader(default_memory_pool(),
+                               ParquetFileReader.OpenFile(path)))
+        else:
+            get_reader(source, &rd_handle)
+            check_status(OpenFile(rd_handle, self.allocator, &self.reader))
+
+    @property
+    def metadata(self):
+        cdef:
+            shared_ptr[CFileMetaData] metadata
+            FileMetaData result
+        if self._metadata is not None:
+            return self._metadata
+
+        metadata = self.reader.get().parquet_reader().metadata()
+
+        self._metadata = result = FileMetaData()
+        result.init(metadata)
+        return result
+
+    def read_all(self):
+        cdef:
+            Table table = Table()
+            shared_ptr[CTable] ctable
+
+        with nogil:
+            check_status(self.reader.get()
+                         .ReadFlatTable(&ctable))
+
+        table.init(ctable)
+        return table
+
+    def column_name_idx(self, column_name):
+        """
+        Find the matching index of a column in the schema.
+
+        Parameters
+        ----------
+        column_name : str
+            Name of the column; nesting levels are separated by ".".
+
+        Returns
+        -------
+        column_idx : int
+            Integer index of the column's position in the schema.
+        """
+        cdef:
+            FileMetaData container = self.metadata
+            const CFileMetaData* metadata = container.metadata
+            int i = 0
+
+        if self.column_idx_map is None:
+            self.column_idx_map = {}
+            for i in range(0, metadata.num_columns()):
+                col_bytes = tobytes(metadata.schema().Column(i)
+                                    .path().get().ToDotString())
+                self.column_idx_map[col_bytes] = i
+
+        return self.column_idx_map[tobytes(column_name)]
+
+    def read_column(self, int column_index):
+        cdef:
+            Array array = Array()
+            shared_ptr[CArray] carray
+
+        with nogil:
+            check_status(self.reader.get()
+                         .ReadFlatColumn(column_index, &carray))
+
+        array.init(carray)
+        return array
+
+
+cdef check_compression_name(name):
+    if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI']:
+        raise ArrowException("Unsupported compression: " + name)
+
+
+cdef ParquetCompression compression_from_name(object name):
+    name = name.upper()
+    if name == "SNAPPY":
+        return ParquetCompression_SNAPPY
+    elif name == "GZIP":
+        return ParquetCompression_GZIP
+    elif name == "LZO":
+        return ParquetCompression_LZO
+    elif name == "BROTLI":
+        return ParquetCompression_BROTLI
+    else:
+        return ParquetCompression_UNCOMPRESSED
+
+
+cdef class ParquetWriter:
+    cdef:
+        shared_ptr[WriterProperties] properties
+        shared_ptr[OutputStream] sink
+
+    cdef readonly:
+        object use_dictionary
+        object compression
+        object version
+        int row_group_size
+
+    def __cinit__(self, where, use_dictionary=None, compression=None,
+                  version=None):
+        cdef shared_ptr[FileOutputStream] filestream
+
+        if isinstance(where, six.string_types):
+            check_status(FileOutputStream.Open(tobytes(where), &filestream))
+            self.sink = <shared_ptr[OutputStream]> filestream
+        else:
+            get_writer(where, &self.sink)
+
+        self.use_dictionary = use_dictionary
+        self.compression = compression
+        self.version = version
+        self._setup_properties()
+
+    cdef _setup_properties(self):
+        cdef WriterProperties.Builder properties_builder
+        self._set_version(&properties_builder)
+        self._set_compression_props(&properties_builder)
+        self._set_dictionary_props(&properties_builder)
+        self.properties = properties_builder.build()
+
+    cdef _set_version(self, WriterProperties.Builder* props):
+        if self.version is not None:
+            if self.version == "1.0":
+                props.version(ParquetVersion_V1)
+            elif self.version == "2.0":
+                props.version(ParquetVersion_V2)
+            else:
+                raise ArrowException("Unsupported Parquet format version")
+
+    cdef _set_compression_props(self, WriterProperties.Builder* props):
+        if isinstance(self.compression, six.string_types):
+            check_compression_name(self.compression)
+            props.compression(compression_from_name(self.compression))
+        elif self.compression is not None:
+            # Deactivate dictionary encoding by default
+            props.disable_dictionary()
+            for column, codec in six.iteritems(self.compression):
+                check_compression_name(codec)
+                props.compression(column, compression_from_name(codec))
+
+    cdef _set_dictionary_props(self, WriterProperties.Builder* props):
+        if isinstance(self.use_dictionary, bool):
+            if self.use_dictionary:
+                props.enable_dictionary()
+            else:
+                props.disable_dictionary()
+        else:
+            # Deactivate dictionary encoding by default
+            props.disable_dictionary()
+            for column in self.use_dictionary:
+                props.enable_dictionary(column)
+
+    def write_table(self, Table table, row_group_size=None):
+        cdef CTable* ctable = table.table
+
+        if row_group_size is None:
+            row_group_size = ctable.num_rows()
+
+        cdef int c_row_group_size = row_group_size
+        with nogil:
+            check_status(WriteFlatTable(ctable, default_memory_pool(),
+                                        self.sink, c_row_group_size,
+                                        self.properties))

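Internally, ParquetReader caches a dotted-path-to-index map so that columns
can be fetched by name. A hedged sketch against the private extension module
(the path and column name are illustrative):

    from pyarrow import _parquet

    reader = _parquet.ParquetReader()
    reader.open('example.parquet')

    idx = reader.column_name_idx('my_column')  # nested levels joined by '.'
    arr = reader.read_column(idx)              # single column as an Array
    table = reader.read_all()                  # or the whole file as a Table
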
http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/pyarrow/includes/parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd
deleted file mode 100644
index d9e121d..0000000
--- a/python/pyarrow/includes/parquet.pxd
+++ /dev/null
@@ -1,147 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# distutils: language = c++
-
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport CArray, CSchema, CStatus, CTable, MemoryPool
-from pyarrow.includes.libarrow_io cimport ReadableFileInterface, OutputStream
-
-
-cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
-  cdef cppclass Node:
-    pass
-
-  cdef cppclass GroupNode(Node):
-    pass
-
-  cdef cppclass PrimitiveNode(Node):
-    pass
-
-  cdef cppclass ColumnPath:
-    c_string ToDotString()
-
-cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
-  enum ParquetVersion" parquet::ParquetVersion::type":
-      PARQUET_1_0" parquet::ParquetVersion::PARQUET_1_0"
-      PARQUET_2_0" parquet::ParquetVersion::PARQUET_2_0"
-
-  enum Compression" parquet::Compression::type":
-      UNCOMPRESSED" parquet::Compression::UNCOMPRESSED"
-      SNAPPY" parquet::Compression::SNAPPY"
-      GZIP" parquet::Compression::GZIP"
-      LZO" parquet::Compression::LZO"
-      BROTLI" parquet::Compression::BROTLI"
-
-  cdef cppclass ColumnDescriptor:
-    shared_ptr[ColumnPath] path()
-
-  cdef cppclass SchemaDescriptor:
-    const ColumnDescriptor* Column(int i)
-    shared_ptr[Node] schema()
-    GroupNode* group()
-
-
-cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
-    cdef cppclass ColumnReader:
-        pass
-
-    cdef cppclass BoolReader(ColumnReader):
-        pass
-
-    cdef cppclass Int32Reader(ColumnReader):
-        pass
-
-    cdef cppclass Int64Reader(ColumnReader):
-        pass
-
-    cdef cppclass Int96Reader(ColumnReader):
-        pass
-
-    cdef cppclass FloatReader(ColumnReader):
-        pass
-
-    cdef cppclass DoubleReader(ColumnReader):
-        pass
-
-    cdef cppclass ByteArrayReader(ColumnReader):
-        pass
-
-    cdef cppclass RowGroupReader:
-        pass
-
-    cdef cppclass FileMetaData:
-        uint32_t size()
-        int num_columns()
-        int64_t num_rows()
-        int num_row_groups()
-        int32_t version()
-        const c_string created_by()
-        int num_schema_elements()
-        const SchemaDescriptor* schema()
-
-    cdef cppclass ParquetFileReader:
-        # TODO: Some default arguments are missing
-        @staticmethod
-        unique_ptr[ParquetFileReader] OpenFile(const c_string& path)
-        shared_ptr[FileMetaData] metadata();
-
-
-cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
-    cdef cppclass ParquetOutputStream" parquet::OutputStream":
-        pass
-
-    cdef cppclass LocalFileOutputStream(ParquetOutputStream):
-        LocalFileOutputStream(const c_string& path)
-        void Close()
-
-    cdef cppclass WriterProperties:
-        cppclass Builder:
-            Builder* version(ParquetVersion version)
-            Builder* compression(Compression codec)
-            Builder* compression(const c_string& path, Compression codec)
-            Builder* disable_dictionary()
-            Builder* enable_dictionary()
-            Builder* enable_dictionary(const c_string& path)
-            shared_ptr[WriterProperties] build()
-
-
-cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
-    CStatus OpenFile(const shared_ptr[ReadableFileInterface]& file,
-                     MemoryPool* allocator,
-                     unique_ptr[FileReader]* reader)
-
-    cdef cppclass FileReader:
-        FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader)
-        CStatus ReadFlatColumn(int i, shared_ptr[CArray]* out);
-        CStatus ReadFlatTable(shared_ptr[CTable]* out);
-        const ParquetFileReader* parquet_reader();
-
-
-cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
-    CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema,
-                              shared_ptr[CSchema]* out)
-    CStatus ToParquetSchema(const CSchema* arrow_schema,
-                            shared_ptr[SchemaDescriptor]* out)
-
-
-cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
-    cdef CStatus WriteFlatTable(
-        const CTable* table, MemoryPool* pool,
-        const shared_ptr[OutputStream]& sink,
-        int64_t chunk_size,
-        const shared_ptr[WriterProperties]& properties)

http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
new file mode 100644
index 0000000..2dedb72
--- /dev/null
+++ b/python/pyarrow/parquet.py
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pyarrow._parquet as _parquet
+from pyarrow.table import Table
+
+
+class ParquetFile(object):
+    """
+    Open a Parquet binary file for reading
+
+    Parameters
+    ----------
+    source : str or pyarrow.io.NativeFile
+        Readable source. For passing Python file objects or byte buffers,
+        see pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
+    metadata : FileMetaData, default None
+        Use existing metadata object, rather than reading from file.
+    """
+    def __init__(self, source, metadata=None):
+        self.reader = _parquet.ParquetReader()
+        self.reader.open(source)
+        self._metadata = metadata
+
+    @property
+    def metadata(self):
+        if self._metadata is None:
+            self._metadata = self.reader.metadata
+        return self._metadata
+
+    @property
+    def schema(self):
+        return self.metadata.schema
+
+    def read(self, nrows=None, columns=None):
+        """
+        Read a Table from Parquet format
+
+        Parameters
+        ----------
+        nrows : int, default None
+            Not yet supported; passing a value raises NotImplementedError.
+        columns : list
+            If not None, only these columns will be read from the file.
+
+        Returns
+        -------
+        pyarrow.table.Table
+            Content of the file as a table (of columns)
+        """
+        if nrows is not None:
+            raise NotImplementedError("nrows argument")
+
+        if columns is None:
+            return self.reader.read_all()
+        else:
+            column_idxs = [self.reader.column_name_idx(column)
+                           for column in columns]
+            arrays = [self.reader.read_column(column_idx)
+                      for column_idx in column_idxs]
+            return Table.from_arrays(columns, arrays)
+
+
+def read_table(source, columns=None):
+    """
+    Read a Table from Parquet format
+
+    Parameters
+    ----------
+    source : str or pyarrow.io.NativeFile
+        Readable source. For passing Python file objects or byte buffers, see
+        pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
+    columns : list
+        If not None, only these columns will be read from the file.
+
+    Returns
+    -------
+    pyarrow.table.Table
+        Content of the file as a table (of columns)
+    """
+    return ParquetFile(source).read(columns=columns)
+
+
+def write_table(table, sink, chunk_size=None, version=None,
+                use_dictionary=True, compression=None):
+    """
+    Write a Table to Parquet format
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+    sink : str or pyarrow.io.NativeFile
+    chunk_size : int
+        The maximum number of rows in each Parquet RowGroup. As a default,
+        we will write a single RowGroup per file.
+    version : {"1.0", "2.0"}, default "1.0"
+        The Parquet format version, defaults to 1.0
+    use_dictionary : bool or list
+        Specify if we should use dictionary encoding in general or only for
+        some columns.
+    compression : str or dict
+        Specify the compression codec, either on a general basis or per-column.
+    """
+    writer = _parquet.ParquetWriter(sink, use_dictionary=use_dictionary,
+                                    compression=compression,
+                                    version=version)
+    writer.write_table(table, row_group_size=chunk_size)

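The high-level writer options map onto the WriterProperties builder wrapped in
_parquet.pyx. A hedged sketch of per-column settings (data, file name, and
codec availability depend on the local parquet-cpp build):

    import pandas as pd
    import pyarrow as A
    import pyarrow.parquet as pq

    df = pd.DataFrame({'name': ['a', 'b'], 'values': [1.0, 2.0]})
    table = A.Table.from_pandas(df)

    # dictionary-encode only 'name'; pick a codec per column
    pq.write_table(table, 'out.parquet', version='2.0',
                   use_dictionary=['name'],
                   compression={'name': 'SNAPPY', 'values': 'GZIP'})

    # read back a subset of the columns
    subset = pq.read_table('out.parquet', columns=['name'])
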
http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/pyarrow/parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx
deleted file mode 100644
index c092185..0000000
--- a/python/pyarrow/parquet.pyx
+++ /dev/null
@@ -1,244 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-
-from pyarrow.includes.libarrow cimport *
-from pyarrow.includes.parquet cimport *
-from pyarrow.includes.libarrow_io cimport ReadableFileInterface, OutputStream, FileOutputStream
-cimport pyarrow.includes.pyarrow as pyarrow
-
-from pyarrow.array cimport Array
-from pyarrow.compat import tobytes
-from pyarrow.error import ArrowException
-from pyarrow.error cimport check_status
-from pyarrow.io import NativeFile
-from pyarrow.table cimport Table
-
-from pyarrow.io cimport NativeFile, get_reader, get_writer
-
-import six
-
-__all__ = [
-    'read_table',
-    'write_table'
-]
-
-cdef class ParquetReader:
-    cdef:
-        MemoryPool* allocator
-        unique_ptr[FileReader] reader
-        column_idx_map
-
-    def __cinit__(self):
-        self.allocator = default_memory_pool()
-
-    def open(self, source):
-        self._open(source)
-
-    cdef _open(self, object source):
-        cdef:
-            shared_ptr[ReadableFileInterface] rd_handle
-            c_string path
-
-        if isinstance(source, six.string_types):
-            path = tobytes(source)
-
-            # Must be in one expression to avoid calling std::move which is not
-            # possible in Cython (due to missing rvalue support)
-
-            # TODO(wesm): ParquetFileReader::OpenFile can throw?
-            self.reader = unique_ptr[FileReader](
-                new FileReader(default_memory_pool(),
-                               ParquetFileReader.OpenFile(path)))
-        else:
-            get_reader(source, &rd_handle)
-            check_status(OpenFile(rd_handle, self.allocator, &self.reader))
-
-    def read_all(self):
-        cdef:
-            Table table = Table()
-            shared_ptr[CTable] ctable
-
-        with nogil:
-            check_status(self.reader.get()
-                         .ReadFlatTable(&ctable))
-
-        table.init(ctable)
-        return table
-
-    def column_name_idx(self, column_name):
-        """
-        Find the matching index of a column in the schema.
-
-        Parameter
-        ---------
-        column_name: str
-            Name of the column, separation of nesting levels is done via ".".
-
-        Returns
-        -------
-        column_idx: int
-            Integer index of the position of the column
-        """
-        cdef:
-            const FileMetaData* metadata = (self.reader.get().parquet_reader()
-                                            .metadata().get())
-            int i = 0
-
-        if self.column_idx_map is None:
-            self.column_idx_map = {}
-            for i in range(0, metadata.num_columns()):
-                col_bytes = tobytes(metadata.schema().Column(i)
-                                    .path().get().ToDotString())
-                self.column_idx_map[col_bytes] = i
-
-        return self.column_idx_map[tobytes(column_name)]
-
-    def read_column(self, int column_index):
-        cdef:
-            Array array = Array()
-            shared_ptr[CArray] carray
-
-        with nogil:
-            check_status(self.reader.get()
-                         .ReadFlatColumn(column_index, &carray))
-
-        array.init(carray)
-        return array
-
-
-def read_table(source, columns=None):
-    """
-    Read a Table from Parquet format
-
-    Parameters
-    ----------
-    source: str or pyarrow.io.NativeFile
-        Readable source. For passing Python file objects or byte buffers, see
-        pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
-    columns: list
-        If not None, only these columns will be read from the file.
-
-    Returns
-    -------
-    pyarrow.table.Table
-        Content of the file as a table (of columns)
-    """
-    cdef ParquetReader reader = ParquetReader()
-    reader._open(source)
-
-    if columns is None:
-        return reader.read_all()
-    else:
-        column_idxs = [reader.column_name_idx(column) for column in columns]
-        arrays = [reader.read_column(column_idx) for column_idx in column_idxs]
-        return Table.from_arrays(columns, arrays)
-
-
-def write_table(table, sink, chunk_size=None, version=None,
-                use_dictionary=True, compression=None):
-    """
-    Write a Table to Parquet format
-
-    Parameters
-    ----------
-    table : pyarrow.Table
-    sink: string or pyarrow.io.NativeFile
-    chunk_size : int
-        The maximum number of rows in each Parquet RowGroup. As a default,
-        we will write a single RowGroup per file.
-    version : {"1.0", "2.0"}, default "1.0"
-        The Parquet format version, defaults to 1.0
-    use_dictionary : bool or list
-        Specify if we should use dictionary encoding in general or only for
-        some columns.
-    compression : str or dict
-        Specify the compression codec, either on a general basis or per-column.
-    """
-    cdef Table table_ = table
-    cdef CTable* ctable_ = table_.table
-    cdef shared_ptr[FileOutputStream] filesink_
-    cdef shared_ptr[OutputStream] sink_
-
-    cdef WriterProperties.Builder properties_builder
-    cdef int64_t chunk_size_ = 0
-    if chunk_size is None:
-        chunk_size_ = ctable_.num_rows()
-    else:
-        chunk_size_ = chunk_size
-
-    if version is not None:
-        if version == "1.0":
-            properties_builder.version(PARQUET_1_0)
-        elif version == "2.0":
-            properties_builder.version(PARQUET_2_0)
-        else:
-            raise ArrowException("Unsupported Parquet format version")
-
-    if isinstance(use_dictionary, bool):
-        if use_dictionary:
-            properties_builder.enable_dictionary()
-        else:
-            properties_builder.disable_dictionary()
-    else:
-        # Deactivate dictionary encoding by default
-        properties_builder.disable_dictionary()
-        for column in use_dictionary:
-            properties_builder.enable_dictionary(column)
-
-    if isinstance(compression, basestring):
-        if compression == "NONE":
-            properties_builder.compression(UNCOMPRESSED)
-        elif compression == "SNAPPY":
-            properties_builder.compression(SNAPPY)
-        elif compression == "GZIP":
-            properties_builder.compression(GZIP)
-        elif compression == "LZO":
-            properties_builder.compression(LZO)
-        elif compression == "BROTLI":
-            properties_builder.compression(BROTLI)
-        else:
-            raise ArrowException("Unsupport compression codec")
-    elif compression is not None:
-        # Deactivate dictionary encoding by default
-        properties_builder.disable_dictionary()
-        for column, codec in compression.iteritems():
-            if codec == "NONE":
-                properties_builder.compression(column, UNCOMPRESSED)
-            elif codec == "SNAPPY":
-                properties_builder.compression(column, SNAPPY)
-            elif codec == "GZIP":
-                properties_builder.compression(column, GZIP)
-            elif codec == "LZO":
-                properties_builder.compression(column, LZO)
-            elif codec == "BROTLI":
-                properties_builder.compression(column, BROTLI)
-            else:
-                raise ArrowException("Unsupport compression codec")
-
-    if isinstance(sink, six.string_types):
-        check_status(FileOutputStream.Open(tobytes(sink), &filesink_))
-        sink_ = <shared_ptr[OutputStream]>filesink_
-    else:
-        get_writer(sink, &sink_)
-
-    with nogil:
-        check_status(WriteFlatTable(ctable_, default_memory_pool(), sink_,
-                                    chunk_size_, properties_builder.build()))

http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 0fb913c..ad4bc58 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -55,10 +55,8 @@ def test_single_pylist_column_roundtrip(tmpdir):
             assert data_written.equals(data_read)
 
 
-@parquet
-def test_pandas_parquet_2_0_rountrip(tmpdir):
-    size = 10000
-    np.random.seed(0)
+def alltypes_sample(size=10000, seed=0):
+    np.random.seed(seed)
     df = pd.DataFrame({
         'uint8': np.arange(size, dtype=np.uint8),
         'uint16': np.arange(size, dtype=np.uint16),
@@ -71,13 +69,21 @@ def test_pandas_parquet_2_0_rountrip(tmpdir):
         'float32': np.arange(size, dtype=np.float32),
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,
-        # Pandas only support ns resolution, Arrow at the moment only ms
+        # TODO(wesm): Test other timestamp resolutions now that arrow supports
+        # them
         'datetime': np.arange("2016-01-01T00:00:00.001", size,
                               dtype='datetime64[ms]'),
         'str': [str(x) for x in range(size)],
         'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
         'empty_str': [''] * size
     })
+    return df
+
+
+@parquet
+def test_pandas_parquet_2_0_rountrip(tmpdir):
+    df = alltypes_sample(size=10000)
+
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = A.Table.from_pandas(df, timestamps_to_ms=True)
     A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
@@ -117,6 +123,7 @@ def test_pandas_parquet_1_0_rountrip(tmpdir):
 
     pdt.assert_frame_equal(df, df_read)
 
+
 @parquet
 def test_pandas_column_selection(tmpdir):
     size = 10000
@@ -227,3 +234,57 @@ def test_pandas_parquet_configuration_options(tmpdir):
         table_read = pq.read_table(filename.strpath)
         df_read = table_read.to_pandas()
         pdt.assert_frame_equal(df, df_read)
+
+
+@parquet
+def test_parquet_metadata_api():
+    df = alltypes_sample(size=10000)
+    df = df.reindex(columns=sorted(df.columns))
+
+    a_table = A.Table.from_pandas(df, timestamps_to_ms=True)
+
+    buf = io.BytesIO()
+    pq.write_table(a_table, buf, compression='snappy', version='2.0')
+
+    buf.seek(0)
+    fileh = pq.ParquetFile(buf)
+
+    ncols = len(df.columns)
+
+    # Series of sniff tests
+    meta = fileh.metadata
+    repr(meta)
+    assert meta.num_rows == len(df)
+    assert meta.num_columns == ncols
+    assert meta.num_row_groups == 1
+    assert meta.format_version == '2.0'
+    assert 'parquet-cpp' in meta.created_by
+
+    # Schema
+    schema = fileh.schema
+    assert meta.schema is schema
+    assert len(schema) == ncols
+    repr(schema)
+
+    col = schema[0]
+    repr(col)
+    assert col.name == df.columns[0]
+    assert col.max_definition_level == 1
+    assert col.max_repetition_level == 0
+
+    assert col.physical_type == 'BOOLEAN'
+    assert col.logical_type == 'NONE'
+
+    with pytest.raises(IndexError):
+        schema[ncols]
+
+    with pytest.raises(IndexError):
+        schema[-1]
+
+    # Row group
+    rg_meta = meta.row_group(0)
+    repr(rg_meta)
+
+    assert rg_meta.num_rows == len(df)
+    assert rg_meta.num_columns == ncols

http://git-wip-us.apache.org/repos/asf/arrow/blob/f44b6a3b/python/setup.py
----------------------------------------------------------------------
diff --git a/python/setup.py b/python/setup.py
index 3829a79..72ff584 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -95,7 +95,7 @@ class build_ext(_build_ext):
         'error',
         'io',
         'ipc',
-        'parquet',
+        '_parquet',
         'scalar',
         'schema',
         'table']
@@ -214,7 +214,7 @@ class build_ext(_build_ext):
         os.chdir(saved_cwd)
 
     def _failure_permitted(self, name):
-        if name == 'parquet' and not self.with_parquet:
+        if name == '_parquet' and not self.with_parquet:
             return True
         return False