You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/12/23 22:29:28 UTC

[2/2] arrow git commit: ARROW-432: [Python] Construct precise pandas BlockManager structure for zero-copy DataFrame initialization

ARROW-432: [Python] Construct precise pandas BlockManager structure for zero-copy DataFrame initialization

This avoids "memory tripling" (because pd.DataFrame will often immediately consolidate the arrays) and will also allow the Arrow->pandas deserialization to be parallelized for further performance gains.

@xhochy this also has the effect of coercing all timestamps to `datetime64[ns]` -- for pandas I believe this is the proper behavior, but wanted to run it by you.

In a local benchmark on roughly 1GB of data I have:

setup code:

```python
import numpy as np
import pandas as pd
import pyarrow as pa

DATA_SIZE = (1 << 30)
NCOLS = 100

data = np.random.randn(NCOLS, DATA_SIZE / NCOLS / 8).T
data[::2] = np.nan

df = pd.DataFrame(data, columns=['c' + str(i) for i in range(NCOLS)])

table = pa.Table.from_pandas(df)
```

before these changes (I added a `block_based` argument to toggle this code path off):

```python
In [4]: %timeit df2 = table.to_pandas(block_based=False)
1 loop, best of 3: 252 ms per loop
```

after these changes:

```python
In [5]: %timeit df2 = table.to_pandas()
10 loops, best of 3: 139 ms per loop
```

This takes the effective in-memory bandwidth on numerical data from 3.97 GB/s to 7.19 GB/s.

I also moved the clang-format and clang-tidy files to the top level so we can more easily run code formatting on the C++ code under python/.

Author: Wes McKinney <we...@twosigma.com>

Closes #251 from wesm/ARROW-432 and squashes the following commits:

f22e1b5 [Wes McKinney] Remove unneeded import
ea83ded [Wes McKinney] Unit tests pass again
ec239b8 [Wes McKinney] Fix DataFrame constructions, code formatting
af960ee [Wes McKinney] Draft blocks -> DataFrame scaffold
110692f [Wes McKinney] First draft of scaffolding for creating precise pandas.DataFrame block structure
4928a0b [Wes McKinney] Refactor post rebase
c89cfaf [Wes McKinney] Rearrange to-pandas deserialization to better permit reads into pre-allocated memory


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/65af9ea1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/65af9ea1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/65af9ea1

Branch: refs/heads/master
Commit: 65af9ea16a3c9241a66203b57cbfe2041a5ee52b
Parents: fd4eb98
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Dec 23 17:29:17 2016 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Dec 23 17:29:17 2016 -0500

----------------------------------------------------------------------
 .clang-format                          |   65 ++
 .clang-tidy                            |   14 +
 .clang-tidy-ignore                     |    2 +
 cpp/CMakeLists.txt                     |    3 +-
 cpp/src/.clang-format                  |   65 --
 cpp/src/.clang-tidy                    |   14 -
 cpp/src/.clang-tidy-ignore             |    2 -
 python/pyarrow/includes/pyarrow.pxd    |    5 +-
 python/pyarrow/table.pyx               |   54 +-
 python/src/pyarrow/adapters/builtin.cc |   66 +-
 python/src/pyarrow/adapters/builtin.h  |    4 +-
 python/src/pyarrow/adapters/pandas.cc  | 1184 +++++++++++++++++++--------
 python/src/pyarrow/adapters/pandas.h   |   33 +-
 python/src/pyarrow/api.h               |    2 +-
 python/src/pyarrow/common.cc           |    4 +-
 python/src/pyarrow/common.h            |   52 +-
 python/src/pyarrow/config.cc           |    5 +-
 python/src/pyarrow/config.h            |    6 +-
 python/src/pyarrow/helpers.cc          |   37 +-
 python/src/pyarrow/helpers.h           |    4 +-
 python/src/pyarrow/io.cc               |   14 +-
 python/src/pyarrow/io.h                |    6 +-
 python/src/pyarrow/numpy_interop.h     |    4 +-
 python/src/pyarrow/util/datetime.h     |    8 +-
 python/src/pyarrow/util/test_main.cc   |    2 +-
 25 files changed, 1067 insertions(+), 588 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/.clang-format
----------------------------------------------------------------------
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..7d5b3cf
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,65 @@
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: false 
+AlignConsecutiveAssignments: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true 
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false 
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true 
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 90 
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false 
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1000
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/.clang-tidy
----------------------------------------------------------------------
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..deaa9bd
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,14 @@
+---
+Checks:          'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
+HeaderFilterRegex: 'arrow/.*'
+AnalyzeTemporaryDtors: true
+CheckOptions:    
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '1'
+  - key:             google-readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             google-readability-namespace-comments.ShortNamespaceLines
+    value:           '10'
+  - key:             google-readability-namespace-comments.SpacesBeforeComments
+    value:           '2'
+

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/.clang-tidy-ignore b/.clang-tidy-ignore
new file mode 100644
index 0000000..5ab4d20
--- /dev/null
+++ b/.clang-tidy-ignore
@@ -0,0 +1,2 @@
+ipc-adapter-test.cc
+memory-pool-test.cc

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 315995c..93e9853 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -741,7 +741,8 @@ endif (UNIX)
 if (${CLANG_FORMAT_FOUND})
   # runs clang format and updates files in place.
   add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
-  `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`
+    `find ${CMAKE_CURRENT_SOURCE_DIR}/../python -name \\*.cc -or -name \\*.h`)
 
   # runs clang format and exits with a non-zero exit code if any files need to be reformatted
   add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/src/.clang-format
----------------------------------------------------------------------
diff --git a/cpp/src/.clang-format b/cpp/src/.clang-format
deleted file mode 100644
index 7d5b3cf..0000000
--- a/cpp/src/.clang-format
+++ /dev/null
@@ -1,65 +0,0 @@
----
-Language:        Cpp
-# BasedOnStyle:  Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: false 
-AlignConsecutiveAssignments: false
-AlignEscapedNewlinesLeft: true
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true 
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: false 
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true 
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-ColumnLimit: 90 
-CommentPragmas:  '^ IWYU pragma:'
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false 
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
-IndentCaseLabels: true
-IndentWidth:     2
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1000
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles:  false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Cpp11
-TabWidth:        8
-UseTab:          Never

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/src/.clang-tidy
----------------------------------------------------------------------
diff --git a/cpp/src/.clang-tidy b/cpp/src/.clang-tidy
deleted file mode 100644
index deaa9bd..0000000
--- a/cpp/src/.clang-tidy
+++ /dev/null
@@ -1,14 +0,0 @@
----
-Checks:          'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
-HeaderFilterRegex: 'arrow/.*'
-AnalyzeTemporaryDtors: true
-CheckOptions:    
-  - key:             google-readability-braces-around-statements.ShortStatementLines
-    value:           '1'
-  - key:             google-readability-function-size.StatementThreshold
-    value:           '800'
-  - key:             google-readability-namespace-comments.ShortNamespaceLines
-    value:           '10'
-  - key:             google-readability-namespace-comments.SpacesBeforeComments
-    value:           '2'
-

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/cpp/src/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/cpp/src/.clang-tidy-ignore b/cpp/src/.clang-tidy-ignore
deleted file mode 100644
index 5ab4d20..0000000
--- a/cpp/src/.clang-tidy-ignore
+++ /dev/null
@@ -1,2 +0,0 @@
-ipc-adapter-test.cc
-memory-pool-test.cc

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index a5444c2..dc6ccd2 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -18,7 +18,7 @@
 # distutils: language = c++
 
 from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn,
+from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
                                         CDataType, CStatus, Type, MemoryPool)
 
 cimport pyarrow.includes.libarrow_io as arrow_io
@@ -39,6 +39,9 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
     CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr,
                                   PyObject* py_ref, PyObject** out)
 
+    CStatus ConvertTableToPandas(const shared_ptr[CTable]& table,
+                                 int nthreads, PyObject** out)
+
     MemoryPool* get_memory_pool()
 
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 2f7d430..9375557 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -430,6 +430,32 @@ cdef class RecordBatch:
         return result
 
 
+cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads):
+    cdef:
+        PyObject* result_obj
+        CColumn* col
+        int i
+
+    from pandas.core.internals import BlockManager, make_block
+    from pandas import RangeIndex
+
+    check_status(pyarrow.ConvertTableToPandas(table, nthreads, &result_obj))
+
+    result = PyObject_to_object(result_obj)
+
+    blocks = []
+    for block_arr, placement_arr in result:
+        blocks.append(make_block(block_arr, placement=placement_arr))
+
+    names = []
+    for i in range(table.get().num_columns()):
+        col = table.get().column(i).get()
+        names.append(frombytes(col.name()))
+
+    axes = [names, RangeIndex(table.get().num_rows())]
+    return BlockManager(blocks, axes)
+
+
 cdef class Table:
     """
     A collection of top-level named, equal length Arrow arrays.
@@ -584,7 +610,7 @@ cdef class Table:
         table.init(c_table)
         return table
 
-    def to_pandas(self):
+    def to_pandas(self, nthreads=1, block_based=True):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -599,17 +625,21 @@ cdef class Table:
 
         import pandas as pd
 
-        names = []
-        data = []
-        for i in range(self.table.num_columns()):
-            col = self.table.column(i)
-            column = self.column(i)
-            check_status(pyarrow.ConvertColumnToPandas(
-                col, <PyObject*> column, &arr))
-            names.append(frombytes(col.get().name()))
-            data.append(PyObject_to_object(arr))
-
-        return pd.DataFrame(dict(zip(names, data)), columns=names)
+        if block_based:
+            mgr = table_to_blockmanager(self.sp_table, nthreads)
+            return pd.DataFrame(mgr)
+        else:
+            names = []
+            data = []
+            for i in range(self.table.num_columns()):
+                col = self.table.column(i)
+                column = self.column(i)
+                check_status(pyarrow.ConvertColumnToPandas(
+                    col, <PyObject*> column, &arr))
+                names.append(frombytes(col.get().name()))
+                data.append(PyObject_to_object(arr))
+
+            return pd.DataFrame(dict(zip(names, data)), columns=names)
 
     @property
     def name(self):

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index 2a13944..fb7475f 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -44,16 +44,16 @@ static inline bool IsPyInteger(PyObject* obj) {
 
 class ScalarVisitor {
  public:
-  ScalarVisitor() :
-      total_count_(0),
-      none_count_(0),
-      bool_count_(0),
-      int_count_(0),
-      date_count_(0),
-      timestamp_count_(0),
-      float_count_(0),
-      binary_count_(0),
-      unicode_count_(0) {}
+  ScalarVisitor()
+      : total_count_(0),
+        none_count_(0),
+        bool_count_(0),
+        int_count_(0),
+        date_count_(0),
+        timestamp_count_(0),
+        float_count_(0),
+        binary_count_(0),
+        unicode_count_(0) {}
 
   void Visit(PyObject* obj) {
     ++total_count_;
@@ -100,9 +100,7 @@ class ScalarVisitor {
     }
   }
 
-  int64_t total_count() const {
-    return total_count_;
-  }
+  int64_t total_count() const { return total_count_; }
 
  private:
   int64_t total_count_;
@@ -123,17 +121,14 @@ static constexpr int MAX_NESTING_LEVELS = 32;
 
 class SeqVisitor {
  public:
-  SeqVisitor() :
-      max_nesting_level_(0) {
+  SeqVisitor() : max_nesting_level_(0) {
     memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
   }
 
-  Status Visit(PyObject* obj, int level=0) {
+  Status Visit(PyObject* obj, int level = 0) {
     Py_ssize_t size = PySequence_Size(obj);
 
-    if (level > max_nesting_level_) {
-      max_nesting_level_ = level;
-    }
+    if (level > max_nesting_level_) { max_nesting_level_ = level; }
 
     for (int64_t i = 0; i < size; ++i) {
       // TODO(wesm): Error checking?
@@ -188,9 +183,7 @@ class SeqVisitor {
   int max_observed_level() const {
     int result = 0;
     for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
-      if (nesting_histogram_[i] > 0) {
-        result = i;
-      }
+      if (nesting_histogram_[i] > 0) { result = i; }
     }
     return result;
   }
@@ -198,9 +191,7 @@ class SeqVisitor {
   int num_nesting_levels() const {
     int result = 0;
     for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
-      if (nesting_histogram_[i] > 0) {
-        ++result;
-      }
+      if (nesting_histogram_[i] > 0) { ++result; }
     }
     return result;
   }
@@ -214,8 +205,8 @@ class SeqVisitor {
 };
 
 // Non-exhaustive type inference
-static Status InferArrowType(PyObject* obj, int64_t* size,
-    std::shared_ptr<DataType>* out_type) {
+static Status InferArrowType(
+    PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
   *size = PySequence_Size(obj);
   if (PyErr_Occurred()) {
     // Not a sequence
@@ -224,9 +215,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
   }
 
   // For 0-length sequences, refuse to guess
-  if (*size == 0) {
-    *out_type = arrow::null();
-  }
+  if (*size == 0) { *out_type = arrow::null(); }
 
   SeqVisitor seq_visitor;
   RETURN_NOT_OK(seq_visitor.Visit(obj));
@@ -234,9 +223,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
 
   *out_type = seq_visitor.GetType();
 
-  if (*out_type == nullptr) {
-    return Status::TypeError("Unable to determine data type");
-  }
+  if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); }
 
   return Status::OK();
 }
@@ -337,7 +324,8 @@ class TimestampConverter : public TypedConverter<arrow::TimestampBuilder> {
       if (item.obj() == Py_None) {
         typed_builder_->AppendNull();
       } else {
-        PyDateTime_DateTime* pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+        PyDateTime_DateTime* pydatetime =
+            reinterpret_cast<PyDateTime_DateTime*>(item.obj());
         struct tm datetime = {0};
         datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900;
         datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1;
@@ -462,6 +450,7 @@ class ListConverter : public TypedConverter<arrow::ListBuilder> {
     }
     return Status::OK();
   }
+
  protected:
   std::shared_ptr<SeqConverter> value_converter_;
 };
@@ -496,8 +485,8 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
   builder_ = builder;
   typed_builder_ = static_cast<arrow::ListBuilder*>(builder.get());
 
-  value_converter_ = GetConverter(static_cast<arrow::ListType*>(
-          builder->type().get())->value_type());
+  value_converter_ =
+      GetConverter(static_cast<arrow::ListType*>(builder->type().get())->value_type());
   if (value_converter_ == nullptr) {
     return Status::NotImplemented("value type not implemented");
   }
@@ -521,8 +510,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
   std::shared_ptr<SeqConverter> converter = GetConverter(type);
   if (converter == nullptr) {
     std::stringstream ss;
-    ss << "No type converter implemented for "
-       << type->ToString();
+    ss << "No type converter implemented for " << type->ToString();
     return Status::NotImplemented(ss.str());
   }
 
@@ -536,4 +524,4 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
   return builder->Finish(out);
 }
 
-} // namespace pyarrow
+}  // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/builtin.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h
index 2ddfdaa..1ff3694 100644
--- a/python/src/pyarrow/adapters/builtin.h
+++ b/python/src/pyarrow/adapters/builtin.h
@@ -40,6 +40,6 @@ namespace pyarrow {
 PYARROW_EXPORT
 arrow::Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out);
 
-} // namespace pyarrow
+}  // namespace pyarrow
 
-#endif // PYARROW_ADAPTERS_BUILTIN_H
+#endif  // PYARROW_ADAPTERS_BUILTIN_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index 38f3b6f..899eb55 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -28,10 +28,13 @@
 #include <memory>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 
 #include "arrow/api.h"
-#include "arrow/util/bit-util.h"
 #include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/macros.h"
 
 #include "pyarrow/common.h"
 #include "pyarrow/config.h"
@@ -40,10 +43,13 @@
 namespace pyarrow {
 
 using arrow::Array;
+using arrow::ChunkedArray;
 using arrow::Column;
 using arrow::Field;
 using arrow::DataType;
 using arrow::Status;
+using arrow::Table;
+using arrow::Type;
 
 namespace BitUtil = arrow::BitUtil;
 
@@ -51,8 +57,7 @@ namespace BitUtil = arrow::BitUtil;
 // Serialization
 
 template <int TYPE>
-struct npy_traits {
-};
+struct npy_traits {};
 
 template <>
 struct npy_traits<NPY_BOOL> {
@@ -60,21 +65,17 @@ struct npy_traits<NPY_BOOL> {
   using TypeClass = arrow::BooleanType;
 
   static constexpr bool supports_nulls = false;
-  static inline bool isnull(uint8_t v) {
-    return false;
-  }
+  static inline bool isnull(uint8_t v) { return false; }
 };
 
-#define NPY_INT_DECL(TYPE, CapType, T)              \
-  template <>                                       \
-  struct npy_traits<NPY_##TYPE> {                   \
-    typedef T value_type;                           \
-    using TypeClass = arrow::CapType##Type;         \
-                                                    \
-    static constexpr bool supports_nulls = false;   \
-    static inline bool isnull(T v) {                \
-      return false;                                 \
-    }                                               \
+#define NPY_INT_DECL(TYPE, CapType, T)               \
+  template <>                                        \
+  struct npy_traits<NPY_##TYPE> {                    \
+    typedef T value_type;                            \
+    using TypeClass = arrow::CapType##Type;          \
+                                                     \
+    static constexpr bool supports_nulls = false;    \
+    static inline bool isnull(T v) { return false; } \
   };
 
 NPY_INT_DECL(INT8, Int8, int8_t);
@@ -93,9 +94,7 @@ struct npy_traits<NPY_FLOAT32> {
 
   static constexpr bool supports_nulls = true;
 
-  static inline bool isnull(float v) {
-    return v != v;
-  }
+  static inline bool isnull(float v) { return v != v; }
 };
 
 template <>
@@ -105,9 +104,7 @@ struct npy_traits<NPY_FLOAT64> {
 
   static constexpr bool supports_nulls = true;
 
-  static inline bool isnull(double v) {
-    return v != v;
-  }
+  static inline bool isnull(double v) { return v != v; }
 };
 
 template <>
@@ -135,18 +132,14 @@ struct npy_traits<NPY_OBJECT> {
 template <int TYPE>
 class ArrowSerializer {
  public:
-  ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) :
-      pool_(pool),
-      arr_(arr),
-      mask_(mask) {
+  ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask)
+      : pool_(pool), arr_(arr), mask_(mask) {
     length_ = PyArray_SIZE(arr_);
   }
 
   Status Convert(std::shared_ptr<Array>* out);
 
-  int stride() const {
-    return PyArray_STRIDES(arr_)[0];
-  }
+  int stride() const { return PyArray_STRIDES(arr_)[0]; }
 
   Status InitNullBitmap() {
     int null_bytes = BitUtil::BytesForBits(length_);
@@ -215,9 +208,7 @@ class ArrowSerializer {
         const int32_t length = PyBytes_GET_SIZE(obj);
         s = string_builder.Append(PyBytes_AS_STRING(obj), length);
         Py_DECREF(obj);
-        if (!s.ok()) {
-          return s;
-        }
+        if (!s.ok()) { return s; }
       } else if (PyBytes_Check(obj)) {
         have_bytes = true;
         const int32_t length = PyBytes_GET_SIZE(obj);
@@ -230,8 +221,8 @@ class ArrowSerializer {
 
     if (have_bytes) {
       const auto& arr = static_cast<const arrow::StringArray&>(*out->get());
-      *out = std::make_shared<arrow::BinaryArray>(arr.length(), arr.offsets(),
-          arr.data(), arr.null_count(), arr.null_bitmap());
+      *out = std::make_shared<arrow::BinaryArray>(
+          arr.length(), arr.offsets(), arr.data(), arr.null_count(), arr.null_bitmap());
     }
     return Status::OK();
   }
@@ -259,8 +250,7 @@ class ArrowSerializer {
       }
     }
 
-    *out = std::make_shared<arrow::BooleanArray>(length_, data, null_count,
-        null_bitmap_);
+    *out = std::make_shared<arrow::BooleanArray>(length_, data, null_count, null_bitmap_);
 
     return Status::OK();
   }
@@ -321,26 +311,27 @@ inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out
 }
 
 template <>
-inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(std::shared_ptr<DataType>* out) {
+inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(
+    std::shared_ptr<DataType>* out) {
   PyArray_Descr* descr = PyArray_DESCR(arr_);
   auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
   arrow::TimestampType::Unit unit;
 
   switch (date_dtype->meta.base) {
-      case NPY_FR_s:
-          unit = arrow::TimestampType::Unit::SECOND;
-          break;
-      case NPY_FR_ms:
-          unit = arrow::TimestampType::Unit::MILLI;
-          break;
-      case NPY_FR_us:
-          unit = arrow::TimestampType::Unit::MICRO;
-          break;
-      case NPY_FR_ns:
-          unit = arrow::TimestampType::Unit::NANO;
-          break;
-      default:
-          return Status::Invalid("Unknown NumPy datetime unit");
+    case NPY_FR_s:
+      unit = arrow::TimestampType::Unit::SECOND;
+      break;
+    case NPY_FR_ms:
+      unit = arrow::TimestampType::Unit::MILLI;
+      break;
+    case NPY_FR_us:
+      unit = arrow::TimestampType::Unit::MICRO;
+      break;
+    case NPY_FR_ns:
+      unit = arrow::TimestampType::Unit::NANO;
+      break;
+    default:
+      return Status::Invalid("Unknown NumPy datetime unit");
   }
 
   out->reset(new arrow::TimestampType(unit));
@@ -351,9 +342,7 @@ template <int TYPE>
 inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
   typedef npy_traits<TYPE> traits;
 
-  if (mask_ != nullptr || traits::supports_nulls) {
-    RETURN_NOT_OK(InitNullBitmap());
-  }
+  if (mask_ != nullptr || traits::supports_nulls) { RETURN_NOT_OK(InitNullBitmap()); }
 
   int64_t null_count = 0;
   if (mask_ != nullptr) {
@@ -429,9 +418,7 @@ inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out)
 template <int TYPE>
 inline Status ArrowSerializer<TYPE>::ConvertData() {
   // TODO(wesm): strided arrays
-  if (is_strided()) {
-    return Status::Invalid("no support for strided data yet");
-  }
+  if (is_strided()) { return Status::Invalid("no support for strided data yet"); }
 
   data_ = std::make_shared<NumPyBuffer>(arr_);
   return Status::OK();
@@ -439,9 +426,7 @@ inline Status ArrowSerializer<TYPE>::ConvertData() {
 
 template <>
 inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
-  if (is_strided()) {
-    return Status::Invalid("no support for strided data yet");
-  }
+  if (is_strided()) { return Status::Invalid("no support for strided data yet"); }
 
   int nbytes = BitUtil::BytesForBits(length_);
   auto buffer = std::make_shared<arrow::PoolBuffer>(pool_);
@@ -453,9 +438,7 @@ inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
 
   memset(bitmap, 0, nbytes);
   for (int i = 0; i < length_; ++i) {
-    if (values[i] > 0) {
-      BitUtil::SetBit(bitmap, i);
-    }
+    if (values[i] > 0) { BitUtil::SetBit(bitmap, i); }
   }
 
   data_ = buffer;
@@ -468,29 +451,24 @@ inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
   return Status::TypeError("NYI");
 }
 
+#define TO_ARROW_CASE(TYPE)                                 \
+  case NPY_##TYPE: {                                        \
+    ArrowSerializer<NPY_##TYPE> converter(pool, arr, mask); \
+    RETURN_NOT_OK(converter.Convert(out));                  \
+  } break;
 
-#define TO_ARROW_CASE(TYPE)                                     \
-  case NPY_##TYPE:                                              \
-    {                                                           \
-      ArrowSerializer<NPY_##TYPE> converter(pool, arr, mask);   \
-      RETURN_NOT_OK(converter.Convert(out));                    \
-    }                                                           \
-    break;
-
-Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
-    std::shared_ptr<Array>* out) {
+Status PandasMaskedToArrow(
+    arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr<Array>* out) {
   PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
   PyArrayObject* mask = nullptr;
 
-  if (mo != nullptr) {
-    mask = reinterpret_cast<PyArrayObject*>(mo);
-  }
+  if (mo != nullptr) { mask = reinterpret_cast<PyArrayObject*>(mo); }
 
   if (PyArray_NDIM(arr) != 1) {
     return Status::Invalid("only handle 1-dimensional arrays");
   }
 
-  switch(PyArray_DESCR(arr)->type_num) {
+  switch (PyArray_DESCR(arr)->type_num) {
     TO_ARROW_CASE(BOOL);
     TO_ARROW_CASE(INT8);
     TO_ARROW_CASE(INT16);
@@ -506,15 +484,13 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
     TO_ARROW_CASE(OBJECT);
     default:
       std::stringstream ss;
-      ss << "unsupported type " << PyArray_DESCR(arr)->type_num
-         << std::endl;
+      ss << "unsupported type " << PyArray_DESCR(arr)->type_num << std::endl;
       return Status::NotImplemented(ss.str());
   }
   return Status::OK();
 }
 
-Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
-    std::shared_ptr<Array>* out) {
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<Array>* out) {
   return PandasMaskedToArrow(pool, ao, nullptr, out);
 }
 
@@ -522,28 +498,27 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
 // Deserialization
 
 template <int TYPE>
-struct arrow_traits {
-};
+struct arrow_traits {};
 
 template <>
 struct arrow_traits<arrow::Type::BOOL> {
   static constexpr int npy_type = NPY_BOOL;
   static constexpr bool supports_nulls = false;
   static constexpr bool is_boolean = true;
-  static constexpr bool is_pandas_numeric_not_nullable = false;
-  static constexpr bool is_pandas_numeric_nullable = false;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = false;
 };
 
-#define INT_DECL(TYPE)                                           \
-  template <>                                                    \
-  struct arrow_traits<arrow::Type::TYPE> {                       \
-    static constexpr int npy_type = NPY_##TYPE;                  \
-    static constexpr bool supports_nulls = false;                \
-    static constexpr double na_value = NAN;                      \
-    static constexpr bool is_boolean = false;                    \
-    static constexpr bool is_pandas_numeric_not_nullable = true; \
-    static constexpr bool is_pandas_numeric_nullable = false;    \
-    typedef typename npy_traits<NPY_##TYPE>::value_type T;       \
+#define INT_DECL(TYPE)                                     \
+  template <>                                              \
+  struct arrow_traits<arrow::Type::TYPE> {                 \
+    static constexpr int npy_type = NPY_##TYPE;            \
+    static constexpr bool supports_nulls = false;          \
+    static constexpr double na_value = NAN;                \
+    static constexpr bool is_boolean = false;              \
+    static constexpr bool is_numeric_not_nullable = true;  \
+    static constexpr bool is_numeric_nullable = false;     \
+    typedef typename npy_traits<NPY_##TYPE>::value_type T; \
   };
 
 INT_DECL(INT8);
@@ -561,8 +536,8 @@ struct arrow_traits<arrow::Type::FLOAT> {
   static constexpr bool supports_nulls = true;
   static constexpr float na_value = NAN;
   static constexpr bool is_boolean = false;
-  static constexpr bool is_pandas_numeric_not_nullable = false;
-  static constexpr bool is_pandas_numeric_nullable = true;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = true;
   typedef typename npy_traits<NPY_FLOAT32>::value_type T;
 };
 
@@ -572,19 +547,21 @@ struct arrow_traits<arrow::Type::DOUBLE> {
   static constexpr bool supports_nulls = true;
   static constexpr double na_value = NAN;
   static constexpr bool is_boolean = false;
-  static constexpr bool is_pandas_numeric_not_nullable = false;
-  static constexpr bool is_pandas_numeric_nullable = true;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = true;
   typedef typename npy_traits<NPY_FLOAT64>::value_type T;
 };
 
+static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
+
 template <>
 struct arrow_traits<arrow::Type::TIMESTAMP> {
   static constexpr int npy_type = NPY_DATETIME;
   static constexpr bool supports_nulls = true;
-  static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+  static constexpr int64_t na_value = kPandasTimestampNull;
   static constexpr bool is_boolean = false;
-  static constexpr bool is_pandas_numeric_not_nullable = false;
-  static constexpr bool is_pandas_numeric_nullable = true;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = true;
   typedef typename npy_traits<NPY_DATETIME>::value_type T;
 };
 
@@ -592,10 +569,10 @@ template <>
 struct arrow_traits<arrow::Type::DATE> {
   static constexpr int npy_type = NPY_DATETIME;
   static constexpr bool supports_nulls = true;
-  static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+  static constexpr int64_t na_value = kPandasTimestampNull;
   static constexpr bool is_boolean = false;
-  static constexpr bool is_pandas_numeric_not_nullable = false;
-  static constexpr bool is_pandas_numeric_nullable = true;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = true;
   typedef typename npy_traits<NPY_DATETIME>::value_type T;
 };
 
@@ -604,18 +581,39 @@ struct arrow_traits<arrow::Type::STRING> {
   static constexpr int npy_type = NPY_OBJECT;
   static constexpr bool supports_nulls = true;
   static constexpr bool is_boolean = false;
-  static constexpr bool is_pandas_numeric_not_nullable = false;
-  static constexpr bool is_pandas_numeric_nullable = false;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = false;
+};
+
+template <>  // BINARY uses the same trait values as STRING: object dtype, nullable.
+struct arrow_traits<arrow::Type::BINARY> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_numeric_not_nullable = false;
+  static constexpr bool is_numeric_nullable = false;
 };
 
+template <typename T>  // T: the Arrow array class whose values are being wrapped
+struct WrapBytes {};   // Empty primary template; the specializations provide Wrap().
 
-static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
+template <>
+struct WrapBytes<arrow::StringArray> {
+  static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
 #if PY_MAJOR_VERSION >= 3
-  return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+    return PyUnicode_FromStringAndSize(reinterpret_cast<const char*>(data), length);
 #else
-  return PyString_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+    return PyString_FromStringAndSize(reinterpret_cast<const char*>(data), length);
 #endif
-}
+  }
+};
+
+template <>  // Binary values are handed to Python via PyBytes_FromStringAndSize.
+struct WrapBytes<arrow::BinaryArray> {
+  static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
+    return PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+  }
+};
 
 inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
   if (type == NPY_DATETIME) {
@@ -645,20 +643,169 @@ inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out)
   }
 }
 
-template <int TYPE>
-class ArrowDeserializer {
- public:
-  ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref) :
-      col_(col), py_ref_(py_ref) {}
+template <typename T>
+inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> arr = data.chunk(c);
+    auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+    // Upcast to double, set NaN as appropriate
 
-  Status Convert(PyObject** out) {
-    const std::shared_ptr<arrow::ChunkedArray> data = col_->data();
+    for (int i = 0; i < arr->length(); ++i) {
+      *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i];
+    }
+  }
+}
 
-    RETURN_NOT_OK(ConvertValues<TYPE>(data));
-    *out = reinterpret_cast<PyObject*>(out_);
+// Bulk-copy each chunk's value buffer into out_values, back to back; validity is ignored.
+template <typename T>
+inline void ConvertIntegerNoNullsSameType(const ChunkedArray& data, T* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> chunk = data.chunk(c);
+    auto values = static_cast<arrow::PrimitiveArray*>(chunk.get())->data()->data();
+    memcpy(out_values, values, sizeof(T) * chunk->length());
+    out_values += chunk->length();
+  }
+}
 
-    return Status::OK();
+// Cast-copy all chunk values into consecutive out_values slots (validity is ignored).
+template <typename InType, typename OutType>
+inline void ConvertIntegerNoNullsCast(const ChunkedArray& data, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> arr = data.chunk(c);
+    auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+    auto in_values = reinterpret_cast<const InType*>(prim_arr->data()->data());
+    // The cursor must advance per element, otherwise every value lands in slot 0.
+    for (int64_t i = 0; i < arr->length(); ++i) { *out_values++ = in_values[i]; }
+  }
+}
+
+// Write one new Python reference per slot of `data` into out_values: Py_None for a
+// null slot, otherwise Py_True or Py_False according to BooleanArray::Value.
+static Status ConvertBooleanWithNulls(const ChunkedArray& data, PyObject** out_values) {
+  // PyAcquireGIL stays in scope for the whole call since refcounts are touched below.
+  PyAcquireGIL lock;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> chunk = data.chunk(c);
+    auto bools = static_cast<arrow::BooleanArray*>(chunk.get());
+
+    for (int64_t i = 0; i < chunk->length(); ++i) {
+      // Select the singleton first so the reference count is bumped in one place.
+      PyObject* item;
+      if (bools->IsNull(i)) {
+        item = Py_None;
+      } else {
+        item = bools->Value(i) ? Py_True : Py_False;
+      }
+      Py_INCREF(item);
+      *out_values++ = item;
+    }
+  }
+  return Status::OK();
+}
+
+// Store each boolean of `data` as one uint8_t in out_values; validity is not consulted.
+static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto bools = static_cast<arrow::BooleanArray*>(data.chunk(c).get());
+    for (int64_t i = 0; i < bools->length(); ++i, ++out_values) {
+      *out_values = static_cast<uint8_t>(bools->Value(i));
+    }
+  }
+}
+
+// Fill out_values with one Python object per slot of a string/binary column: Py_None
+// for nulls, otherwise the object built by WrapBytes<ArrayType>::Wrap. Stops with
+// UnknownError as soon as Wrap returns nullptr.
+template <typename ArrayType>
+inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) {
+  PyAcquireGIL lock;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto chunk = static_cast<ArrayType*>(data.chunk(c).get());
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < chunk->length(); ++i, ++out_values) {
+      if (has_nulls && chunk->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+        continue;
+      }
+      int32_t num_bytes;
+      const uint8_t* bytes = chunk->GetValue(i, &num_bytes);
+      *out_values = WrapBytes<ArrayType>::Wrap(bytes, num_bytes);
+      if (*out_values == nullptr) {
+        return Status::UnknownError("String initialization failed");
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Copy a nullable numeric column into out_values, writing na_value for null slots.
+template <typename T>
+inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> chunk = data.chunk(c);
+    auto prim = static_cast<arrow::PrimitiveArray*>(chunk.get());
+    auto values = reinterpret_cast<const T*>(prim->data()->data());
+    const uint8_t* validity = chunk->null_bitmap_data();
+    if (chunk->null_count() <= 0) {
+      // Nothing to patch up: take the whole value buffer in one copy.
+      memcpy(out_values, values, sizeof(T) * chunk->length());
+      out_values += chunk->length();
+      continue;
+    }
+    for (int64_t i = 0; i < chunk->length(); ++i) {
+      *out_values++ = BitUtil::BitNotSet(validity, i) ? na_value : values[i];
+    }
   }
+}
+
+// Cast each non-null InType value to OutType into out_values; nulls get na_value.
+template <typename InType, typename OutType>
+inline void ConvertNumericNullableCast(
+    const ChunkedArray& data, OutType na_value, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> chunk = data.chunk(c);
+    auto values = reinterpret_cast<const InType*>(
+        static_cast<arrow::PrimitiveArray*>(chunk.get())->data()->data());
+    for (int64_t i = 0; i < chunk->length(); ++i, ++out_values) {
+      *out_values = chunk->IsNull(i) ? na_value : static_cast<OutType>(values[i]);
+    }
+  }
+}
+
+// Divide each non-null value by the milliseconds in a day; nulls become na_value.
+template <typename T>
+inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) {
+  constexpr int kMillisPerDay = 86400000;  // 1000 * 60 * 60 * 24
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> chunk = data.chunk(c);
+    auto values = reinterpret_cast<const T*>(
+        static_cast<arrow::PrimitiveArray*>(chunk.get())->data()->data());
+    for (int64_t i = 0; i < chunk->length(); ++i, ++out_values) {
+      *out_values = chunk->IsNull(i) ? na_value : values[i] / kMillisPerDay;
+    }
+  }
+}
+
+// Widen each non-null value to int64_t and multiply by SHIFT; nulls become the NaT value.
+template <typename InType, int SHIFT>
+inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const std::shared_ptr<Array> chunk = data.chunk(c);
+    auto values = reinterpret_cast<const InType*>(
+        static_cast<arrow::PrimitiveArray*>(chunk.get())->data()->data());
+    for (int64_t i = 0; i < chunk->length(); ++i, ++out_values) {
+      *out_values =
+          chunk->IsNull(i) ? kPandasTimestampNull : static_cast<int64_t>(values[i]) * SHIFT;
+    }
+  }
+}
+
+class ArrowDeserializer {
+ public:
+  ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref)
+      : col_(col), data_(*col->data().get()), py_ref_(py_ref) {}
 
   Status AllocateOutput(int type) {
     PyAcquireGIL lock;
@@ -676,20 +823,29 @@ class ArrowDeserializer {
     return Status::OK();
   }
 
-  Status OutputFromData(int type, void* data) {
+  template <int TYPE>
+  Status ConvertValuesZeroCopy(int npy_type, std::shared_ptr<Array> arr) {
+    typedef typename arrow_traits<TYPE>::T T;
+
+    auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+
+    // Zero-Copy. We can pass the data pointer directly to NumPy.
+    void* data = const_cast<T*>(in_values);
+
     PyAcquireGIL lock;
 
     // Zero-Copy. We can pass the data pointer directly to NumPy.
     npy_intp dims[1] = {col_->length()};
-    out_ = reinterpret_cast<PyArrayObject*>(PyArray_SimpleNewFromData(1, dims,
-                type, data));
+    out_ = reinterpret_cast<PyArrayObject*>(
+        PyArray_SimpleNewFromData(1, dims, npy_type, data));
 
     if (out_ == NULL) {
       // Error occurred, trust that SimpleNew set the error state
       return Status::OK();
     }
 
-    set_numpy_metadata(type, col_->type().get(), out_);
+    set_numpy_metadata(npy_type, col_->type().get(), out_);
 
     if (PyArray_SetBaseObject(out_, py_ref_) == -1) {
       // Error occurred, trust that SetBaseObject set the error state
@@ -705,317 +861,621 @@ class ArrowDeserializer {
     return Status::OK();
   }
 
-  template <int T2>
-  Status ConvertValuesZeroCopy(std::shared_ptr<Array> arr) {
-    typedef typename arrow_traits<T2>::T T;
+  // ----------------------------------------------------------------------
+  // Allocate new array and deserialize. Can do a zero copy conversion for some
+  // types
 
-    auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
-    auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+  Status Convert(PyObject** out) {
+#define CONVERT_CASE(TYPE)                             \
+  case arrow::Type::TYPE: {                            \
+    RETURN_NOT_OK(ConvertValues<arrow::Type::TYPE>()); \
+  } break;
+
+    switch (col_->type()->type) {
+      CONVERT_CASE(BOOL);
+      CONVERT_CASE(INT8);
+      CONVERT_CASE(INT16);
+      CONVERT_CASE(INT32);
+      CONVERT_CASE(INT64);
+      CONVERT_CASE(UINT8);
+      CONVERT_CASE(UINT16);
+      CONVERT_CASE(UINT32);
+      CONVERT_CASE(UINT64);
+      CONVERT_CASE(FLOAT);
+      CONVERT_CASE(DOUBLE);
+      CONVERT_CASE(BINARY);
+      CONVERT_CASE(STRING);
+      CONVERT_CASE(DATE);
+      CONVERT_CASE(TIMESTAMP);
+      default:
+        return Status::NotImplemented("Arrow type reading not implemented");
+    }
 
-    // Zero-Copy. We can pass the data pointer directly to NumPy.
-    void* data = const_cast<T*>(in_values);
-    int type = arrow_traits<TYPE>::npy_type;
-    RETURN_NOT_OK(OutputFromData(type, data));
+#undef CONVERT_CASE
 
+    *out = reinterpret_cast<PyObject*>(out_);
     return Status::OK();
   }
 
-  template <int T2>
+  template <int TYPE>
   inline typename std::enable_if<
-    (T2 != arrow::Type::DATE) & arrow_traits<T2>::is_pandas_numeric_nullable, Status>::type
-  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
-    typedef typename arrow_traits<T2>::T T;
-    size_t chunk_offset = 0;
+      (TYPE != arrow::Type::DATE) & arrow_traits<TYPE>::is_numeric_nullable, Status>::type
+  ConvertValues() {
+    typedef typename arrow_traits<TYPE>::T T;
+    int npy_type = arrow_traits<TYPE>::npy_type;
 
-    if (data->num_chunks() == 1 && data->null_count() == 0) {
-      return ConvertValuesZeroCopy<TYPE>(data->chunk(0));
+    if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+      return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
     }
 
-    RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+    RETURN_NOT_OK(AllocateOutput(npy_type));
+    auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
+    ConvertNumericNullable<T>(data_, arrow_traits<TYPE>::na_value, out_values);
 
-    for (int c = 0; c < data->num_chunks(); c++) {
-      const std::shared_ptr<Array> arr = data->chunk(c);
-      auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
-      auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-      auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+    return Status::OK();
+  }
 
-      if (arr->null_count() > 0) {
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          out_values[i] = arr->IsNull(i) ? arrow_traits<T2>::na_value : in_values[i];
-        }
-      } else {
-        memcpy(out_values, in_values, sizeof(T) * arr->length());
-      }
+  // DATE columns: call AllocateOutput, then let ConvertDates fill the new array.
+  template <int TYPE>
+  inline typename std::enable_if<TYPE == arrow::Type::DATE, Status>::type
+  ConvertValues() {
+    using ValueType = typename arrow_traits<TYPE>::T;
+    RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+    auto* dest = reinterpret_cast<ValueType*>(PyArray_DATA(out_));
+    ConvertDates<ValueType>(data_, arrow_traits<TYPE>::na_value, dest);
+    return Status::OK();
+  }
 
-      chunk_offset += arr->length();
+  // Integers: a single null-free chunk is handed to ConvertValuesZeroCopy; columns with
+  // nulls are written as NPY_FLOAT64 by ConvertIntegerWithNulls; anything else is
+  // copied into an array of the type's own npy_type.
+  template <int TYPE>
+  inline
+      typename std::enable_if<arrow_traits<TYPE>::is_numeric_not_nullable, Status>::type
+      ConvertValues() {
+    using ValueType = typename arrow_traits<TYPE>::T;
+    const int npy_type = arrow_traits<TYPE>::npy_type;
+    if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+      return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
+    }
+    if (data_.null_count() <= 0) {
+      RETURN_NOT_OK(AllocateOutput(npy_type));
+      ConvertIntegerNoNullsSameType<ValueType>(
+          data_, reinterpret_cast<ValueType*>(PyArray_DATA(out_)));
+    } else {
+      RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+      ConvertIntegerWithNulls<ValueType>(
+          data_, reinterpret_cast<double*>(PyArray_DATA(out_)));
+    }
 
     return Status::OK();
   }
 
+  // Booleans: without nulls they fill an array of arrow_traits<TYPE>::npy_type; with
+  // nulls they become an NPY_OBJECT array filled by ConvertBooleanWithNulls.
+  template <int TYPE>
+  inline typename std::enable_if<arrow_traits<TYPE>::is_boolean, Status>::type
+  ConvertValues() {
+    const bool has_nulls = data_.null_count() > 0;
+    if (!has_nulls) {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+      ConvertBooleanNoNulls(data_, reinterpret_cast<uint8_t*>(PyArray_DATA(out_)));
+      return Status::OK();
+    }
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+    auto py_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+    return ConvertBooleanWithNulls(data_, py_values);
+  }
+
+  // UTF8 strings: always an NPY_OBJECT array, filled by ConvertBinaryLike<StringArray>.
+  template <int TYPE>
+  inline typename std::enable_if<TYPE == arrow::Type::STRING, Status>::type
+  ConvertValues() {
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+    return ConvertBinaryLike<arrow::StringArray>(
+        data_, reinterpret_cast<PyObject**>(PyArray_DATA(out_)));
+  }
+
   template <int T2>
-  inline typename std::enable_if<
-    T2 == arrow::Type::DATE, Status>::type
-  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
-    typedef typename arrow_traits<T2>::T T;
-    size_t chunk_offset = 0;
+  inline typename std::enable_if<T2 == arrow::Type::BINARY, Status>::type
+  ConvertValues() {
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+    auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+    return ConvertBinaryLike<arrow::BinaryArray>(data_, out_values);
+  }
+
+ private:
+  std::shared_ptr<Column> col_;
+  const arrow::ChunkedArray& data_;
+  PyObject* py_ref_;
+  PyArrayObject* out_;
+};
+
+// Wrap `arr` in a Column whose field is named "dummy" and convert it as a column.
+Status ConvertArrayToPandas(
+    const std::shared_ptr<Array>& arr, PyObject* py_ref, PyObject** out) {
+  static std::string dummy_name = "dummy";
+  auto dummy_field = std::make_shared<Field>(dummy_name, arr->type());
+  return ConvertColumnToPandas(std::make_shared<Column>(dummy_field, arr), py_ref, out);
+}
 
-    RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+// Run a one-shot ArrowDeserializer over `col`; the resulting object is stored in *out.
+Status ConvertColumnToPandas(
+    const std::shared_ptr<Column>& col, PyObject* py_ref, PyObject** out) {
+  return ArrowDeserializer(col, py_ref).Convert(out);
+}
 
-    for (int c = 0; c < data->num_chunks(); c++) {
-      const std::shared_ptr<Array> arr = data->chunk(c);
-      auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
-      auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-      auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+// ----------------------------------------------------------------------
+// pandas 0.x DataFrame conversion internals
 
-      for (int64_t i = 0; i < arr->length(); ++i) {
-        // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
-        out_values[i] = arr->IsNull(i) ? arrow_traits<T2>::na_value : in_values[i] / 86400000;
-      }
+class PandasBlock {
+ public:
+  enum type {
+    OBJECT,
+    UINT8,
+    INT8,
+    UINT16,
+    INT16,
+    UINT32,
+    INT32,
+    UINT64,
+    INT64,
+    FLOAT,
+    DOUBLE,
+    BOOL,
+    DATETIME,
+    CATEGORICAL
+  };
+
+  PandasBlock(int64_t num_rows, int num_columns)
+      : num_rows_(num_rows), num_columns_(num_columns) {}
+
+  virtual Status Allocate() = 0;
+  virtual Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) = 0;
 
-      chunk_offset += arr->length();
+  PyObject* block_arr() { return block_arr_.obj(); }
+
+  PyObject* placement_arr() { return placement_arr_.obj(); }
+
+ protected:
+  // Create the [num_columns_, num_rows_] block array and the int64 placement array.
+  Status AllocateNDArray(int npy_type) {
+    PyAcquireGIL lock;
+    npy_intp block_dims[2] = {num_columns_, num_rows_};
+    PyObject* block_arr = PyArray_SimpleNew(2, block_dims, npy_type);
+    if (block_arr == NULL) {
+      // TODO(wesm): propagate the Python exception; OK here would leave block_data_ unset
+      return Status::UnknownError("Failed to allocate pandas block array");
     }
 
+    npy_intp placement_dims[1] = {num_columns_};
+    PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
+    if (placement_arr == NULL) {
+      // Release the block array created above so it is not leaked.
+      Py_DECREF(block_arr);
+      return Status::UnknownError("Failed to allocate pandas placement array");
+    }
+
+    block_arr_.reset(block_arr);
+    placement_arr_.reset(placement_arr);
+    current_placement_index_ = 0;
+
+    block_data_ = reinterpret_cast<uint8_t*>(
+        PyArray_DATA(reinterpret_cast<PyArrayObject*>(block_arr)));
+    placement_data_ = reinterpret_cast<int64_t*>(
+        PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));
+
     return Status::OK();
   }
 
-  // Integer specialization
-  template <int T2>
-  inline typename std::enable_if<
-    arrow_traits<T2>::is_pandas_numeric_not_nullable, Status>::type
-  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
-    typedef typename arrow_traits<T2>::T T;
-    size_t chunk_offset = 0;
+  int64_t num_rows_;
+  int num_columns_;
+  int current_placement_index_;
 
-    if (data->num_chunks() == 1 && data->null_count() == 0) {
-      return ConvertValuesZeroCopy<TYPE>(data->chunk(0));
-    }
+  OwnedRef block_arr_;
+  uint8_t* block_data_;
 
-    if (data->null_count() > 0) {
-      RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+  // ndarray<int64>
+  OwnedRef placement_arr_;
+  int64_t* placement_data_;
 
-      for (int c = 0; c < data->num_chunks(); c++) {
-        const std::shared_ptr<Array> arr = data->chunk(c);
-        auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
-        auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-        // Upcast to double, set NaN as appropriate
-        auto out_values = reinterpret_cast<double*>(PyArray_DATA(out_)) + chunk_offset;
+  DISALLOW_COPY_AND_ASSIGN(PandasBlock);
+};
 
-        for (int i = 0; i < arr->length(); ++i) {
-          out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i];
-        }
+class ObjectBlock : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
 
-        chunk_offset += arr->length();
-      }
-    } else {
-      RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+  Status Allocate() override { return AllocateNDArray(NPY_OBJECT); }
 
-      for (int c = 0; c < data->num_chunks(); c++) {
-        const std::shared_ptr<Array> arr = data->chunk(c);
-        auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
-        auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-        auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    Type::type type = col->type()->type;
 
-        memcpy(out_values, in_values, sizeof(T) * arr->length());
+    PyObject** out_buffer =
+        reinterpret_cast<PyObject**>(block_data_) + current_placement_index_ * num_rows_;
 
-        chunk_offset += arr->length();
-      }
+    const ChunkedArray& data = *col->data().get();
+
+    if (type == Type::BOOL) {
+      RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer));
+    } else if (type == Type::BINARY) {
+      RETURN_NOT_OK(ConvertBinaryLike<arrow::BinaryArray>(data, out_buffer));
+    } else if (type == Type::STRING) {
+      RETURN_NOT_OK(ConvertBinaryLike<arrow::StringArray>(data, out_buffer));
+    } else {
+      std::stringstream ss;
+      ss << "Unsupported type for object array output: " << col->type()->ToString();
+      return Status::NotImplemented(ss.str());
     }
 
+    placement_data_[current_placement_index_++] = placement;
     return Status::OK();
   }
+};
 
-  // Boolean specialization
-  template <int T2>
-  inline typename std::enable_if<
-    arrow_traits<T2>::is_boolean, Status>::type
-  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
-    size_t chunk_offset = 0;
-    PyAcquireGIL lock;
+template <int ARROW_TYPE, typename C_TYPE>
+class IntBlock : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
 
-    if (data->null_count() > 0) {
-      RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+  Status Allocate() override {
+    return AllocateNDArray(arrow_traits<ARROW_TYPE>::npy_type);
+  }
 
-      for (int c = 0; c < data->num_chunks(); c++) {
-        const std::shared_ptr<Array> arr = data->chunk(c);
-        auto bool_arr = static_cast<arrow::BooleanArray*>(arr.get());
-        auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
-
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          if (bool_arr->IsNull(i)) {
-            Py_INCREF(Py_None);
-            out_values[i] = Py_None;
-          } else if (bool_arr->Value(i)) {
-            // True
-            Py_INCREF(Py_True);
-            out_values[i] = Py_True;
-          } else {
-            // False
-            Py_INCREF(Py_False);
-            out_values[i] = Py_False;
-          }
-        }
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    Type::type type = col->type()->type;
 
-        chunk_offset += bool_arr->length();
-      }
-    } else {
-      RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+    C_TYPE* out_buffer =
+        reinterpret_cast<C_TYPE*>(block_data_) + current_placement_index_ * num_rows_;
 
-      for (int c = 0; c < data->num_chunks(); c++) {
-        const std::shared_ptr<Array> arr = data->chunk(c);
-        auto bool_arr = static_cast<arrow::BooleanArray*>(arr.get());
-        auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(out_)) + chunk_offset;
+    const ChunkedArray& data = *col->data().get();
 
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          out_values[i] = static_cast<uint8_t>(bool_arr->Value(i));
-        }
+    if (type != ARROW_TYPE) { return Status::NotImplemented(col->type()->ToString()); }
 
-        chunk_offset += bool_arr->length();
-      }
+    ConvertIntegerNoNullsSameType<C_TYPE>(data, out_buffer);
+    placement_data_[current_placement_index_++] = placement;
+    return Status::OK();
+  }
+};
+
+using UInt8Block = IntBlock<Type::UINT8, uint8_t>;
+using Int8Block = IntBlock<Type::INT8, int8_t>;
+using UInt16Block = IntBlock<Type::UINT16, uint16_t>;
+using Int16Block = IntBlock<Type::INT16, int16_t>;
+using UInt32Block = IntBlock<Type::UINT32, uint32_t>;
+using Int32Block = IntBlock<Type::INT32, int32_t>;
+using UInt64Block = IntBlock<Type::UINT64, uint64_t>;
+using Int64Block = IntBlock<Type::INT64, int64_t>;
+
+// Block of float32 columns: only FLOAT input is accepted and nulls are stored as NaN.
+class Float32Block : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
+
+  Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); }
+
+  // Convert `col` into the block's next num_rows_-long slot and record `placement`.
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    if (col->type()->type != Type::FLOAT) {
+      return Status::NotImplemented(col->type()->ToString());
+    }
+    float* slot = reinterpret_cast<float*>(block_data_);
+    slot += current_placement_index_ * num_rows_;
+    ConvertNumericNullable<float>(*col->data(), NAN, slot);
+    placement_data_[current_placement_index_++] = placement;
+    return Status::OK();
+  }
+};
+
+class Float64Block : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
+  // Block whose storage is NPY_FLOAT64; integer and float32 columns are converted.
+  Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); }
+  // Write `col` as doubles into the next slot; NotImplemented for non-numeric types.
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    Type::type type = col->type()->type;
+    // Each written column occupies num_rows_ consecutive doubles of the block buffer.
+    double* out_buffer =
+        reinterpret_cast<double*>(block_data_) + current_placement_index_ * num_rows_;
+
+    const ChunkedArray& data = *col->data().get();
+    // INTEGER_CASE carries its own `break`, so the case labels below do not fall through.
+#define INTEGER_CASE(IN_TYPE)                         \
+  ConvertIntegerWithNulls<IN_TYPE>(data, out_buffer); \
+  break;
+
+    switch (type) {
+      case Type::UINT8:
+        INTEGER_CASE(uint8_t);
+      case Type::INT8:
+        INTEGER_CASE(int8_t);
+      case Type::UINT16:
+        INTEGER_CASE(uint16_t);
+      case Type::INT16:
+        INTEGER_CASE(int16_t);
+      case Type::UINT32:
+        INTEGER_CASE(uint32_t);
+      case Type::INT32:
+        INTEGER_CASE(int32_t);
+      case Type::UINT64:
+        INTEGER_CASE(uint64_t);
+      case Type::INT64:
+        INTEGER_CASE(int64_t);
+      case Type::FLOAT:
+        ConvertNumericNullableCast<float, double>(data, NAN, out_buffer);
+        break;
+      case Type::DOUBLE:
+        ConvertNumericNullable<double>(data, NAN, out_buffer);
+        break;
+      default:
+        return Status::NotImplemented(col->type()->ToString());
     }
 
+#undef INTEGER_CASE
+    // Remember where this column goes, then move on to the next slot.
+    placement_data_[current_placement_index_++] = placement;
     return Status::OK();
   }
+};
 
-  // UTF8 strings
-  template <int T2>
-  inline typename std::enable_if<
-    T2 == arrow::Type::STRING, Status>::type
-  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
-    size_t chunk_offset = 0;
-    PyAcquireGIL lock;
+class BoolBlock : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
 
-    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+  Status Allocate() override { return AllocateNDArray(NPY_BOOL); }
 
-    for (int c = 0; c < data->num_chunks(); c++) {
-      const std::shared_ptr<Array> arr = data->chunk(c);
-      auto string_arr = static_cast<arrow::StringArray*>(arr.get());
-      auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
-
-      const uint8_t* data_ptr;
-      int32_t length;
-      if (data->null_count() > 0) {
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          if (string_arr->IsNull(i)) {
-            Py_INCREF(Py_None);
-            out_values[i] = Py_None;
-          } else {
-            data_ptr = string_arr->GetValue(i, &length);
-
-            out_values[i] = make_pystring(data_ptr, length);
-            if (out_values[i] == nullptr) {
-              return Status::UnknownError("String initialization failed");
-            }
-          }
-        }
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    Type::type type = col->type()->type;
+
+    if (type != Type::BOOL) { return Status::NotImplemented(col->type()->ToString()); }
+
+    uint8_t* out_buffer =
+        reinterpret_cast<uint8_t*>(block_data_) + current_placement_index_ * num_rows_;
+
+    ConvertBooleanNoNulls(*col->data().get(), out_buffer);
+    placement_data_[current_placement_index_++] = placement;
+    return Status::OK();
+  }
+};
+
+class DatetimeBlock : public PandasBlock {
+ public:
+  using PandasBlock::PandasBlock;
+
+  Status Allocate() override {
+    RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME));
+
+    PyAcquireGIL lock;
+    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))->c_metadata);
+    date_dtype->meta.base = NPY_FR_ns;
+    return Status::OK();
+  }
+
+  Status WriteNext(const std::shared_ptr<Column>& col, int64_t placement) override {
+    Type::type type = col->type()->type;
+
+    int64_t* out_buffer =
+        reinterpret_cast<int64_t*>(block_data_) + current_placement_index_ * num_rows_;
+
+    const ChunkedArray& data = *col.get()->data();
+
+    if (type == Type::DATE) {
+      // DateType is a millisecond timestamp stored as int64_t
+      // TODO(wesm): Do we want to make sure to zero out the milliseconds?
+      ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
+    } else if (type == Type::TIMESTAMP) {
+      auto ts_type = static_cast<arrow::TimestampType*>(col->type().get());
+
+      if (ts_type->unit == arrow::TimeUnit::NANO) {
+        ConvertNumericNullable<int64_t>(data, kPandasTimestampNull, out_buffer);
+      } else if (ts_type->unit == arrow::TimeUnit::MICRO) {
+        ConvertDatetimeNanos<int64_t, 1000L>(data, out_buffer);
+      } else if (ts_type->unit == arrow::TimeUnit::MILLI) {
+        ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
+      } else if (ts_type->unit == arrow::TimeUnit::SECOND) {
+        ConvertDatetimeNanos<int64_t, 1000000000L>(data, out_buffer);
       } else {
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          data_ptr = string_arr->GetValue(i, &length);
-          out_values[i] = make_pystring(data_ptr, length);
-          if (out_values[i] == nullptr) {
-            return Status::UnknownError("String initialization failed");
-          }
-        }
+        return Status::NotImplemented("Unsupported time unit");
       }
-
-      chunk_offset += string_arr->length();
+    } else {
+      return Status::NotImplemented(col->type()->ToString());
     }
 
+    placement_data_[current_placement_index_++] = placement;
     return Status::OK();
   }
+};
 
-  template <int T2>
-  inline typename std::enable_if<
-    T2 == arrow::Type::BINARY, Status>::type
-  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
-    size_t chunk_offset = 0;
-    PyAcquireGIL lock;
+// class CategoricalBlock : public PandasBlock {};
 
-    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns,
+    std::shared_ptr<PandasBlock>* block) {
+#define BLOCK_CASE(NAME, TYPE)                              \
+  case PandasBlock::NAME:                                   \
+    *block = std::make_shared<TYPE>(num_rows, num_columns); \
+    break;
 
-    for (int c = 0; c < data->num_chunks(); c++) {
-      const std::shared_ptr<Array> arr = data->chunk(c);
-      auto binary_arr = static_cast<arrow::BinaryArray*>(arr.get());
-      auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
-
-      const uint8_t* data_ptr;
-      int32_t length;
-      if (data->null_count() > 0) {
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          if (binary_arr->IsNull(i)) {
-            Py_INCREF(Py_None);
-            out_values[i] = Py_None;
-          } else {
-            data_ptr = binary_arr->GetValue(i, &length);
-
-            out_values[i] = PyBytes_FromStringAndSize(
-                reinterpret_cast<const char*>(data_ptr), length);
-            if (out_values[i] == nullptr) {
-              return Status::UnknownError("String initialization failed");
-            }
-          }
-        }
+  switch (type) {
+    BLOCK_CASE(OBJECT, ObjectBlock);
+    BLOCK_CASE(UINT8, UInt8Block);
+    BLOCK_CASE(INT8, Int8Block);
+    BLOCK_CASE(UINT16, UInt16Block);
+    BLOCK_CASE(INT16, Int16Block);
+    BLOCK_CASE(UINT32, UInt32Block);
+    BLOCK_CASE(INT32, Int32Block);
+    BLOCK_CASE(UINT64, UInt64Block);
+    BLOCK_CASE(INT64, Int64Block);
+    BLOCK_CASE(FLOAT, Float32Block);
+    BLOCK_CASE(DOUBLE, Float64Block);
+    BLOCK_CASE(BOOL, BoolBlock);
+    BLOCK_CASE(DATETIME, DatetimeBlock);
+    case PandasBlock::CATEGORICAL:
+      return Status::NotImplemented("categorical");
+  }
+
+#undef BLOCK_CASE
+
+  return (*block)->Allocate();
+}
+
+// Construct the exact pandas 0.x "BlockManager" memory layout
+//
+// * For each column determine the correct output pandas type
+// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output
+// * Allocate block placement arrays
+// * Write Arrow columns out into each slice of memory; populate block
+//   placement arrays as we go
+class DataFrameBlockCreator {
+ public:
+  DataFrameBlockCreator(const std::shared_ptr<Table>& table) : table_(table) {}
+
+  Status Convert(int nthreads, PyObject** output) {
+    column_types_.resize(table_->num_columns());
+    type_counts_.clear();
+    blocks_.clear();
+
+    RETURN_NOT_OK(CountColumnTypes());
+    RETURN_NOT_OK(CreateBlocks());
+    RETURN_NOT_OK(WriteTableToBlocks(nthreads));
+
+    return GetResultList(output);
+  }
+
+  Status CountColumnTypes() {
+    for (int i = 0; i < table_->num_columns(); ++i) {
+      std::shared_ptr<Column> col = table_->column(i);
+      PandasBlock::type output_type;
+
+      switch (col->type()->type) {
+        case Type::BOOL:
+          output_type = col->null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL;
+          break;
+        case Type::UINT8:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT8;
+          break;
+        case Type::INT8:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT8;
+          break;
+        case Type::UINT16:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT16;
+          break;
+        case Type::INT16:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT16;
+          break;
+        case Type::UINT32:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT32;
+          break;
+        case Type::INT32:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT32;
+          break;
+        case Type::INT64:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::INT64;
+          break;
+        case Type::UINT64:
+          output_type = col->null_count() > 0 ? PandasBlock::DOUBLE : PandasBlock::UINT64;
+          break;
+        case Type::FLOAT:
+          output_type = PandasBlock::FLOAT;
+          break;
+        case Type::DOUBLE:
+          output_type = PandasBlock::DOUBLE;
+          break;
+        case Type::STRING:
+        case Type::BINARY:
+          output_type = PandasBlock::OBJECT;
+          break;
+        case Type::DATE:
+          output_type = PandasBlock::DATETIME;
+          break;
+        case Type::TIMESTAMP:
+          output_type = PandasBlock::DATETIME;
+          break;
+        default:
+          return Status::NotImplemented(col->type()->ToString());
+      }
+
+      auto it = type_counts_.find(output_type);
+      if (it != type_counts_.end()) {
+        // Increment count
+        it->second += 1;
       } else {
-        for (int64_t i = 0; i < arr->length(); ++i) {
-          data_ptr = binary_arr->GetValue(i, &length);
-          out_values[i] = PyBytes_FromStringAndSize(
-              reinterpret_cast<const char*>(data_ptr), length);
-          if (out_values[i] == nullptr) {
-            return Status::UnknownError("String initialization failed");
-          }
-        }
+        // Add key to map
+        type_counts_[output_type] = 1;
       }
 
-      chunk_offset += binary_arr->length();
+      column_types_[i] = output_type;
     }
+    return Status::OK();
+  }
 
+  Status CreateBlocks() {
+    for (const auto& it : type_counts_) {
+      PandasBlock::type type = static_cast<PandasBlock::type>(it.first);
+      std::shared_ptr<PandasBlock> block;
+      RETURN_NOT_OK(MakeBlock(type, table_->num_rows(), it.second, &block));
+      blocks_[type] = block;
+    }
     return Status::OK();
   }
 
- private:
-  std::shared_ptr<Column> col_;
-  PyObject* py_ref_;
-  PyArrayObject* out_;
-};
+  Status WriteTableToBlocks(int nthreads) {
+    if (nthreads > 1) {
+      return Status::NotImplemented("multithreading not yet implemented");
+    }
 
-#define FROM_ARROW_CASE(TYPE)                                       \
-  case arrow::Type::TYPE:                                           \
-    {                                                               \
-      ArrowDeserializer<arrow::Type::TYPE> converter(col, py_ref);  \
-      return converter.Convert(out);                                \
-    }                                                               \
-    break;
+    for (int i = 0; i < table_->num_columns(); ++i) {
+      std::shared_ptr<Column> col = table_->column(i);
+      PandasBlock::type output_type = column_types_[i];
 
-Status ConvertArrayToPandas(const std::shared_ptr<Array>& arr, PyObject* py_ref,
-        PyObject** out) {
-  static std::string dummy_name = "dummy";
-  auto field = std::make_shared<Field>(dummy_name, arr->type());
-  auto col = std::make_shared<Column>(field, arr);
-  return ConvertColumnToPandas(col, py_ref, out);
-}
+      auto it = blocks_.find(output_type);
+      if (it == blocks_.end()) { return Status::KeyError("No block allocated"); }
+      RETURN_NOT_OK(it->second->WriteNext(col, i));
+    }
+    return Status::OK();
+  }
 
-Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_ref,
-        PyObject** out) {
-  switch(col->type()->type) {
-    FROM_ARROW_CASE(BOOL);
-    FROM_ARROW_CASE(INT8);
-    FROM_ARROW_CASE(INT16);
-    FROM_ARROW_CASE(INT32);
-    FROM_ARROW_CASE(INT64);
-    FROM_ARROW_CASE(UINT8);
-    FROM_ARROW_CASE(UINT16);
-    FROM_ARROW_CASE(UINT32);
-    FROM_ARROW_CASE(UINT64);
-    FROM_ARROW_CASE(FLOAT);
-    FROM_ARROW_CASE(DOUBLE);
-    FROM_ARROW_CASE(BINARY);
-    FROM_ARROW_CASE(STRING);
-    FROM_ARROW_CASE(DATE);
-    FROM_ARROW_CASE(TIMESTAMP);
-    default:
-      return Status::NotImplemented("Arrow type reading not implemented");
+  Status GetResultList(PyObject** out) {
+    auto num_blocks = static_cast<Py_ssize_t>(blocks_.size());
+    PyObject* result = PyList_New(num_blocks);
+    RETURN_IF_PYERROR();
+
+    int i = 0;
+    for (const auto& it : blocks_) {
+      const std::shared_ptr<PandasBlock> block = it.second;
+
+      PyObject* item = PyTuple_New(2);
+      RETURN_IF_PYERROR();
+
+      PyObject* block_arr = block->block_arr();
+      PyObject* placement_arr = block->placement_arr();
+      Py_INCREF(block_arr);
+      Py_INCREF(placement_arr);
+      PyTuple_SET_ITEM(item, 0, block_arr);
+      PyTuple_SET_ITEM(item, 1, placement_arr);
+
+      if (PyList_SET_ITEM(result, i++, item) < 0) { RETURN_IF_PYERROR(); }
+    }
+    *out = result;
+    return Status::OK();
   }
-  return Status::OK();
+
+ private:
+  std::shared_ptr<Table> table_;
+  std::vector<PandasBlock::type> column_types_;
+
+  // block type -> type count
+  std::unordered_map<int, int> type_counts_;
+
+  // block type -> block
+  std::unordered_map<int, std::shared_ptr<PandasBlock>> blocks_;
+};
+
+Status ConvertTableToPandas(
+    const std::shared_ptr<Table>& table, int nthreads, PyObject** out) {
+  DataFrameBlockCreator helper(table);
+  return helper.Convert(nthreads, out);
 }
 
-} // namespace pyarrow
+}  // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index 532495d..60dadd4 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -33,27 +33,42 @@ class Array;
 class Column;
 class MemoryPool;
 class Status;
+class Table;
 
-} // namespace arrow
+}  // namespace arrow
 
 namespace pyarrow {
 
 PYARROW_EXPORT
-arrow::Status ConvertArrayToPandas(const std::shared_ptr<arrow::Array>& arr,
-    PyObject* py_ref, PyObject** out);
+arrow::Status ConvertArrayToPandas(
+    const std::shared_ptr<arrow::Array>& arr, PyObject* py_ref, PyObject** out);
 
 PYARROW_EXPORT
-arrow::Status ConvertColumnToPandas(const std::shared_ptr<arrow::Column>& col,
-    PyObject* py_ref, PyObject** out);
+arrow::Status ConvertColumnToPandas(
+    const std::shared_ptr<arrow::Column>& col, PyObject* py_ref, PyObject** out);
+
+struct PandasOptions {
+  bool strings_to_categorical;
+};
+
+// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+//
+// The returned Python object is a list of tuples consisting of the exact 2D
+// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
+//
+// tuple item: (block: ndarray[TYPE, ndim=2], placement: ndarray of column indices)
+PYARROW_EXPORT
+arrow::Status ConvertTableToPandas(
+    const std::shared_ptr<arrow::Table>& table, int nthreads, PyObject** out);
 
 PYARROW_EXPORT
 arrow::Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
     std::shared_ptr<arrow::Array>* out);
 
 PYARROW_EXPORT
-arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
-    std::shared_ptr<arrow::Array>* out);
+arrow::Status PandasToArrow(
+    arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<arrow::Array>* out);
 
-} // namespace pyarrow
+}  // namespace pyarrow
 
-#endif // PYARROW_ADAPTERS_PANDAS_H
+#endif  // PYARROW_ADAPTERS_PANDAS_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/api.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h
index 6dbbc45..f65cc09 100644
--- a/python/src/pyarrow/api.h
+++ b/python/src/pyarrow/api.h
@@ -23,4 +23,4 @@
 #include "pyarrow/adapters/builtin.h"
 #include "pyarrow/adapters/pandas.h"
 
-#endif // PYARROW_API_H
+#endif  // PYARROW_API_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/common.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc
index fb4d349..8660ac8 100644
--- a/python/src/pyarrow/common.cc
+++ b/python/src/pyarrow/common.cc
@@ -73,7 +73,7 @@ arrow::MemoryPool* get_memory_pool() {
 
 PyBytesBuffer::PyBytesBuffer(PyObject* obj)
     : Buffer(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
-        PyBytes_GET_SIZE(obj)),
+          PyBytes_GET_SIZE(obj)),
       obj_(obj) {
   Py_INCREF(obj_);
 }
@@ -83,4 +83,4 @@ PyBytesBuffer::~PyBytesBuffer() {
   Py_DECREF(obj_);
 }
 
-} // namespace pyarrow
+}  // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index 7e33826..639918d 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -24,7 +24,9 @@
 #include "arrow/buffer.h"
 #include "arrow/util/macros.h"
 
-namespace arrow { class MemoryPool; }
+namespace arrow {
+class MemoryPool;
+}
 
 namespace pyarrow {
 
@@ -34,27 +36,18 @@ class OwnedRef {
  public:
   OwnedRef() : obj_(nullptr) {}
 
-  OwnedRef(PyObject* obj) :
-      obj_(obj) {}
+  OwnedRef(PyObject* obj) : obj_(obj) {}
 
-  ~OwnedRef() {
-    Py_XDECREF(obj_);
-  }
+  ~OwnedRef() { Py_XDECREF(obj_); }
 
   void reset(PyObject* obj) {
-    if (obj_ != nullptr) {
-      Py_XDECREF(obj_);
-    }
+    if (obj_ != nullptr) { Py_XDECREF(obj_); }
     obj_ = obj;
   }
 
-  void release() {
-    obj_ = nullptr;
-  }
+  void release() { obj_ = nullptr; }
 
-  PyObject* obj() const{
-    return obj_;
-  }
+  PyObject* obj() const { return obj_; }
 
  private:
   PyObject* obj_;
@@ -78,13 +71,10 @@ struct PyObjectStringify {
 
 class PyGILGuard {
  public:
-  PyGILGuard() {
-    state_ = PyGILState_Ensure();
-  }
+  PyGILGuard() { state_ = PyGILState_Ensure(); }
+
+  ~PyGILGuard() { PyGILState_Release(state_); }
 
-  ~PyGILGuard() {
-    PyGILState_Release(state_);
-  }
  private:
   PyGILState_STATE state_;
   DISALLOW_COPY_AND_ASSIGN(PyGILGuard);
@@ -108,8 +98,7 @@ PYARROW_EXPORT arrow::MemoryPool* get_memory_pool();
 
 class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer {
  public:
-  NumPyBuffer(PyArrayObject* arr)
-    : Buffer(nullptr, 0) {
+  NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) {
     arr_ = arr;
     Py_INCREF(arr);
 
@@ -118,9 +107,7 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer {
     capacity_ = size_;
   }
 
-  virtual ~NumPyBuffer() {
-    Py_XDECREF(arr_);
-  }
+  virtual ~NumPyBuffer() { Py_XDECREF(arr_); }
 
  private:
   PyArrayObject* arr_;
@@ -135,22 +122,17 @@ class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer {
   PyObject* obj_;
 };
 
-
 class PyAcquireGIL {
  public:
-  PyAcquireGIL() {
-    state_ = PyGILState_Ensure();
-  }
+  PyAcquireGIL() { state_ = PyGILState_Ensure(); }
 
-  ~PyAcquireGIL() {
-    PyGILState_Release(state_);
-  }
+  ~PyAcquireGIL() { PyGILState_Release(state_); }
 
  private:
   PyGILState_STATE state_;
   DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL);
 };
 
-} // namespace pyarrow
+}  // namespace pyarrow
 
-#endif // PYARROW_COMMON_H
+#endif  // PYARROW_COMMON_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/config.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc
index 730d2db..e1002bf 100644
--- a/python/src/pyarrow/config.cc
+++ b/python/src/pyarrow/config.cc
@@ -21,8 +21,7 @@
 
 namespace pyarrow {
 
-void pyarrow_init() {
-}
+void pyarrow_init() {}
 
 PyObject* numpy_nan = nullptr;
 
@@ -31,4 +30,4 @@ void pyarrow_set_numpy_nan(PyObject* obj) {
   numpy_nan = obj;
 }
 
-} // namespace pyarrow
+}  // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/config.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h
index 82936b1..386ee4b 100644
--- a/python/src/pyarrow/config.h
+++ b/python/src/pyarrow/config.h
@@ -24,7 +24,7 @@
 #include "pyarrow/visibility.h"
 
 #if PY_MAJOR_VERSION >= 3
-  #define PyString_Check PyUnicode_Check
+#define PyString_Check PyUnicode_Check
 #endif
 
 namespace pyarrow {
@@ -38,6 +38,6 @@ void pyarrow_init();
 PYARROW_EXPORT
 void pyarrow_set_numpy_nan(PyObject* obj);
 
-} // namespace pyarrow
+}  // namespace pyarrow
 
-#endif // PYARROW_CONFIG_H
+#endif  // PYARROW_CONFIG_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index b42199c..3f65032 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -23,36 +23,35 @@ using namespace arrow;
 
 namespace pyarrow {
 
-
-#define GET_PRIMITIVE_TYPE(NAME, FACTORY)       \
-  case Type::NAME:                              \
-    return FACTORY();                           \
+#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
+  case Type::NAME:                        \
+    return FACTORY();                     \
     break;
 
 std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
   switch (type) {
     case Type::NA:
       return null();
-    GET_PRIMITIVE_TYPE(UINT8, uint8);
-    GET_PRIMITIVE_TYPE(INT8, int8);
-    GET_PRIMITIVE_TYPE(UINT16, uint16);
-    GET_PRIMITIVE_TYPE(INT16, int16);
-    GET_PRIMITIVE_TYPE(UINT32, uint32);
-    GET_PRIMITIVE_TYPE(INT32, int32);
-    GET_PRIMITIVE_TYPE(UINT64, uint64);
-    GET_PRIMITIVE_TYPE(INT64, int64);
-    GET_PRIMITIVE_TYPE(DATE, date);
+      GET_PRIMITIVE_TYPE(UINT8, uint8);
+      GET_PRIMITIVE_TYPE(INT8, int8);
+      GET_PRIMITIVE_TYPE(UINT16, uint16);
+      GET_PRIMITIVE_TYPE(INT16, int16);
+      GET_PRIMITIVE_TYPE(UINT32, uint32);
+      GET_PRIMITIVE_TYPE(INT32, int32);
+      GET_PRIMITIVE_TYPE(UINT64, uint64);
+      GET_PRIMITIVE_TYPE(INT64, int64);
+      GET_PRIMITIVE_TYPE(DATE, date);
     case Type::TIMESTAMP:
       return arrow::timestamp(arrow::TimeUnit::MICRO);
       break;
-    GET_PRIMITIVE_TYPE(BOOL, boolean);
-    GET_PRIMITIVE_TYPE(FLOAT, float32);
-    GET_PRIMITIVE_TYPE(DOUBLE, float64);
-    GET_PRIMITIVE_TYPE(BINARY, binary);
-    GET_PRIMITIVE_TYPE(STRING, utf8);
+      GET_PRIMITIVE_TYPE(BOOL, boolean);
+      GET_PRIMITIVE_TYPE(FLOAT, float32);
+      GET_PRIMITIVE_TYPE(DOUBLE, float64);
+      GET_PRIMITIVE_TYPE(BINARY, binary);
+      GET_PRIMITIVE_TYPE(STRING, utf8);
     default:
       return nullptr;
   }
 }
 
-} // namespace pyarrow
+}  // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
index 8334d97..788c3ee 100644
--- a/python/src/pyarrow/helpers.h
+++ b/python/src/pyarrow/helpers.h
@@ -31,6 +31,6 @@ using arrow::Type;
 PYARROW_EXPORT
 std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
 
-} // namespace pyarrow
+}  // namespace pyarrow
 
-#endif // PYARROW_HELPERS_H
+#endif  // PYARROW_HELPERS_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/io.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc
index 12f5ba0..ac1aa63 100644
--- a/python/src/pyarrow/io.cc
+++ b/python/src/pyarrow/io.cc
@@ -33,8 +33,7 @@ namespace pyarrow {
 // ----------------------------------------------------------------------
 // Python file
 
-PythonFile::PythonFile(PyObject* file)
-    : file_(file) {
+PythonFile::PythonFile(PyObject* file) : file_(file) {
   Py_INCREF(file_);
 }
 
@@ -81,8 +80,8 @@ Status PythonFile::Read(int64_t nbytes, PyObject** out) {
 }
 
 Status PythonFile::Write(const uint8_t* data, int64_t nbytes) {
-  PyObject* py_data = PyBytes_FromStringAndSize(
-      reinterpret_cast<const char*>(data), nbytes);
+  PyObject* py_data =
+      PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), nbytes);
   ARROW_RETURN_NOT_OK(CheckPyError());
 
   PyObject* result = PyObject_CallMethod(file_, "write", "(O)", py_data);
@@ -102,7 +101,7 @@ Status PythonFile::Tell(int64_t* position) {
   // PyLong_AsLongLong can raise OverflowError
   ARROW_RETURN_NOT_OK(CheckPyError());
 
- return Status::OK();
+  return Status::OK();
 }
 
 // ----------------------------------------------------------------------
@@ -156,7 +155,8 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr<arrow::Buffer>* out)
 Status PyReadableFile::GetSize(int64_t* size) {
   PyGILGuard lock;
 
-  int64_t current_position;;
+  int64_t current_position;
+  ;
   ARROW_RETURN_NOT_OK(file_->Tell(&current_position));
 
   ARROW_RETURN_NOT_OK(file_->Seek(0, 2));
@@ -204,7 +204,7 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) {
 
 PyBytesReader::PyBytesReader(PyObject* obj)
     : arrow::io::BufferReader(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
-        PyBytes_GET_SIZE(obj)),
+          PyBytes_GET_SIZE(obj)),
       obj_(obj) {
   Py_INCREF(obj_);
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/65af9ea1/python/src/pyarrow/io.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h
index e14aa8c..fd3e7c0 100644
--- a/python/src/pyarrow/io.h
+++ b/python/src/pyarrow/io.h
@@ -24,7 +24,9 @@
 #include "pyarrow/config.h"
 #include "pyarrow/visibility.h"
 
-namespace arrow { class MemoryPool; }
+namespace arrow {
+class MemoryPool;
+}
 
 namespace pyarrow {
 
@@ -92,6 +94,6 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader {
 
 // TODO(wesm): seekable output files
 
-} // namespace pyarrow
+}  // namespace pyarrow
 
 #endif  // PYARROW_IO_H