You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2016/12/21 08:32:05 UTC

arrow git commit: ARROW-374: More precise handling of bytes vs unicode in Python API

Repository: arrow
Updated Branches:
  refs/heads/master 73455b56f -> 268ffbeff


ARROW-374: More precise handling of bytes vs unicode in Python API

Sequences of Python built-in objects that are not all unicode (i.e. that contain at least one PyBytes object) are now converted to `arrow::BinaryArray` instead of `arrow::StringArray`, since we cannot be sure that the PyBytes objects are UTF-8-encoded strings.

Author: Wes McKinney <we...@twosigma.com>

Closes #249 from wesm/ARROW-374 and squashes the following commits:

1371a30 [Wes McKinney] py3 fixes
8ac3a49 [Wes McKinney] Consistently convert PyBytes to BinaryArray with pandas, too
83d1c05 [Wes McKinney] Remove print statement
c8df606 [Wes McKinney] Timestamp and time cannot be static
4a9aaf4 [Wes McKinney] Add Python interface to BinaryArray, convert PyBytes to binary instead of assuming utf8 unicode


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/268ffbef
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/268ffbef
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/268ffbef

Branch: refs/heads/master
Commit: 268ffbeffb1cd0617e52d381d500a2d10f61124c
Parents: 73455b5
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Dec 21 09:31:56 2016 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Dec 21 09:31:56 2016 +0100

----------------------------------------------------------------------
 cpp/src/arrow/type.cc                        |  6 +-
 python/pyarrow/__init__.py                   |  5 +-
 python/pyarrow/array.pyx                     |  5 ++
 python/pyarrow/includes/libarrow.pxd         |  6 +-
 python/pyarrow/scalar.pyx                    | 16 ++++-
 python/pyarrow/schema.pyx                    |  6 ++
 python/pyarrow/tests/test_convert_builtin.py | 31 ++++++---
 python/pyarrow/tests/test_convert_pandas.py  | 18 +++--
 python/pyarrow/tests/test_scalars.py         | 22 +++++--
 python/src/pyarrow/adapters/builtin.cc       | 80 ++++++++++++++++-------
 python/src/pyarrow/adapters/pandas.cc        | 65 +++++++++++++++++-
 python/src/pyarrow/helpers.cc                | 50 +++++---------
 python/src/pyarrow/helpers.h                 | 16 -----
 13 files changed, 227 insertions(+), 99 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 4748cc3..8ff9eea 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -155,13 +155,11 @@ TYPE_FACTORY(binary, BinaryType);
 TYPE_FACTORY(date, DateType);
 
 std::shared_ptr<DataType> timestamp(TimeUnit unit) {
-  static std::shared_ptr<DataType> result = std::make_shared<TimestampType>();
-  return result;
+  return std::make_shared<TimestampType>(unit);
 }
 
 std::shared_ptr<DataType> time(TimeUnit unit) {
-  static std::shared_ptr<DataType> result = std::make_shared<TimeType>();
-  return result;
+  return std::make_shared<TimeType>(unit);
 }
 
 std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 39ba4c7..9ede934 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -40,13 +40,14 @@ from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
                             BooleanValue,
                             Int8Value, Int16Value, Int32Value, Int64Value,
                             UInt8Value, UInt16Value, UInt32Value, UInt64Value,
-                            FloatValue, DoubleValue, ListValue, StringValue)
+                            FloatValue, DoubleValue, ListValue,
+                            BinaryValue, StringValue)
 
 from pyarrow.schema import (null, bool_,
                             int8, int16, int32, int64,
                             uint8, uint16, uint32, uint64,
                             timestamp, date,
-                            float_, double, string,
+                            float_, double, binary, string,
                             list_, struct, field,
                             DataType, Field, Schema, schema)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 84f1705..c178d5c 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -238,6 +238,10 @@ cdef class StringArray(Array):
     pass
 
 
+cdef class BinaryArray(Array):
+    pass
+
+
 cdef dict _array_classes = {
     Type_NA: NullArray,
     Type_BOOL: BooleanArray,
@@ -253,6 +257,7 @@ cdef dict _array_classes = {
     Type_FLOAT: FloatArray,
     Type_DOUBLE: DoubleArray,
     Type_LIST: ListArray,
+    Type_BINARY: BinaryArray,
     Type_STRING: StringArray,
     Type_TIMESTAMP: Int64Array,
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 419dd74..40fb60d 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -40,6 +40,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
         Type_TIMESTAMP" arrow::Type::TIMESTAMP"
         Type_DATE" arrow::Type::DATE"
+        Type_BINARY" arrow::Type::BINARY"
         Type_STRING" arrow::Type::STRING"
 
         Type_LIST" arrow::Type::LIST"
@@ -161,7 +162,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] values()
         shared_ptr[CDataType] value_type()
 
-    cdef cppclass CStringArray" arrow::StringArray"(CListArray):
+    cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
+        const uint8_t* GetValue(int i, int32_t* length)
+
+    cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray):
         c_string GetString(int i)
 
     cdef cppclass CChunkedArray" arrow::ChunkedArray":

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 623e3e4..a0610a1 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -22,6 +22,7 @@ import pyarrow.schema as schema
 
 import datetime
 
+cimport cpython as cp
 
 NA = None
 
@@ -170,6 +171,18 @@ cdef class StringValue(ArrayValue):
         return frombytes(ap.GetString(self.index))
 
 
+cdef class BinaryValue(ArrayValue):
+
+    def as_py(self):
+        cdef:
+            const uint8_t* ptr
+            int32_t length
+            CBinaryArray* ap = <CBinaryArray*> self.sp_array.get()
+
+        ptr = ap.GetValue(self.index, &length)
+        return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
+
+
 cdef class ListValue(ArrayValue):
 
     def __len__(self):
@@ -218,7 +231,8 @@ cdef dict _scalar_classes = {
     Type_FLOAT: FloatValue,
     Type_DOUBLE: DoubleValue,
     Type_LIST: ListValue,
-    Type_STRING: StringValue
+    Type_BINARY: BinaryValue,
+    Type_STRING: StringValue,
 }
 
 cdef object box_arrow_scalar(DataType type,

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index d05ac9e..7a69b0f 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -215,6 +215,12 @@ def string():
     """
     return primitive_type(Type_STRING)
 
+def binary():
+    """
+    Binary (PyBytes-like) type
+    """
+    return primitive_type(Type_BINARY)
+
 def list_(DataType value_type):
     cdef DataType out = DataType()
     cdef shared_ptr[CDataType] list_type

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 7dc1c1b..a5f7aa5 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pyarrow.compat import unittest
+from pyarrow.compat import unittest, u
 import pyarrow
 
 import datetime
@@ -71,16 +71,28 @@ class TestConvertList(unittest.TestCase):
         assert arr.type == pyarrow.double()
         assert arr.to_pylist() == data
 
-    def test_string(self):
-        data = ['foo', b'bar', None, 'arrow']
+    def test_unicode(self):
+        data = [u('foo'), u('bar'), None, u('arrow')]
         arr = pyarrow.from_pylist(data)
         assert len(arr) == 4
         assert arr.null_count == 1
         assert arr.type == pyarrow.string()
-        assert arr.to_pylist() == ['foo', 'bar', None, 'arrow']
+        assert arr.to_pylist() == [u('foo'), u('bar'), None, u('arrow')]
+
+    def test_bytes(self):
+        u1 = b'ma\xc3\xb1ana'
+        data = [b'foo',
+                u1.decode('utf-8'),  # unicode gets encoded,
+                None]
+        arr = pyarrow.from_pylist(data)
+        assert len(arr) == 3
+        assert arr.null_count == 1
+        assert arr.type == pyarrow.binary()
+        assert arr.to_pylist() == [b'foo', u1, None]
 
     def test_date(self):
-        data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)]
+        data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
+                datetime.date(2040, 2, 26)]
         arr = pyarrow.from_pylist(data)
         assert len(arr) == 4
         assert arr.type == pyarrow.date()
@@ -101,10 +113,13 @@ class TestConvertList(unittest.TestCase):
         assert len(arr) == 4
         assert arr.type == pyarrow.timestamp()
         assert arr.null_count == 1
-        assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
+        assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+                                                   23, 34, 123456)
         assert arr[1].as_py() is None
-        assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
-        assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
+        assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
+                                                   34, 56, 432539)
+        assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
+                                                   46, 57, 437699)
 
     def test_mixed_nesting_levels(self):
         pyarrow.from_pylist([1, 2, None])

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index cf50f3d..da34f85 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -23,6 +23,7 @@ import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
 
+from pyarrow.compat import u
 import pyarrow as A
 
 
@@ -157,13 +158,22 @@ class TestPandasConversion(unittest.TestCase):
         df = pd.DataFrame({'bools': arr})
         self._check_pandas_roundtrip(df)
 
-    def test_strings(self):
+    def test_unicode(self):
         repeats = 1000
-        values = [b'foo', None, u'bar', 'qux', np.nan]
+        values = [u('foo'), None, u('bar'), u('qux'), np.nan]
         df = pd.DataFrame({'strings': values * repeats})
 
-        values = ['foo', None, u'bar', 'qux', None]
-        expected = pd.DataFrame({'strings': values * repeats})
+        self._check_pandas_roundtrip(df)
+
+    def test_bytes_to_binary(self):
+        values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
+        df = pd.DataFrame({'strings': values})
+
+        table = A.from_pandas_dataframe(df)
+        assert table[0].type == A.binary()
+
+        values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
+        expected = pd.DataFrame({'strings': values2})
         self._check_pandas_roundtrip(df, expected)
 
     def test_timestamps_notimezone_no_nulls(self):

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 4fb850a..19cfacb 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pyarrow.compat import unittest, u
+from pyarrow.compat import unittest, u, unicode_type
 import pyarrow as A
 
 
@@ -58,20 +58,32 @@ class TestScalars(unittest.TestCase):
         v = arr[2]
         assert v.as_py() == 3.0
 
-    def test_string(self):
-        arr = A.from_pylist(['foo', None, u('bar')])
+    def test_string_unicode(self):
+        arr = A.from_pylist([u('foo'), None, u('bar')])
 
         v = arr[0]
         assert isinstance(v, A.StringValue)
-        assert repr(v) == "'foo'"
         assert v.as_py() == 'foo'
 
         assert arr[1] is A.NA
 
         v = arr[2].as_py()
-        assert v == 'bar'
+        assert v == u('bar')
         assert isinstance(v, str)
 
+    def test_bytes(self):
+        arr = A.from_pylist([b'foo', None, u('bar')])
+
+        v = arr[0]
+        assert isinstance(v, A.BinaryValue)
+        assert v.as_py() == b'foo'
+
+        assert arr[1] is A.NA
+
+        v = arr[2].as_py()
+        assert v == b'bar'
+        assert isinstance(v, bytes)
+
     def test_list(self):
         arr = A.from_pylist([['foo', None], None, ['bar'], []])
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index e0cb7c2..2a13944 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -42,14 +42,6 @@ static inline bool IsPyInteger(PyObject* obj) {
 #endif
 }
 
-static inline bool IsPyBaseString(PyObject* obj) {
-#if PYARROW_IS_PY2
-  return PyString_Check(obj) || PyUnicode_Check(obj);
-#else
-  return PyUnicode_Check(obj);
-#endif
-}
-
 class ScalarVisitor {
  public:
   ScalarVisitor() :
@@ -60,7 +52,8 @@ class ScalarVisitor {
       date_count_(0),
       timestamp_count_(0),
       float_count_(0),
-      string_count_(0) {}
+      binary_count_(0),
+      unicode_count_(0) {}
 
   void Visit(PyObject* obj) {
     ++total_count_;
@@ -76,8 +69,10 @@ class ScalarVisitor {
       ++date_count_;
     } else if (PyDateTime_CheckExact(obj)) {
       ++timestamp_count_;
-    } else if (IsPyBaseString(obj)) {
-      ++string_count_;
+    } else if (PyBytes_Check(obj)) {
+      ++binary_count_;
+    } else if (PyUnicode_Check(obj)) {
+      ++unicode_count_;
     } else {
       // TODO(wesm): accumulate error information somewhere
     }
@@ -86,20 +81,22 @@ class ScalarVisitor {
   std::shared_ptr<DataType> GetType() {
     // TODO(wesm): handling mixed-type cases
     if (float_count_) {
-      return DOUBLE;
+      return arrow::float64();
     } else if (int_count_) {
       // TODO(wesm): tighter type later
-      return INT64;
+      return arrow::int64();
     } else if (date_count_) {
-      return DATE;
+      return arrow::date();
     } else if (timestamp_count_) {
-      return TIMESTAMP_US;
+      return arrow::timestamp(arrow::TimeUnit::MICRO);
     } else if (bool_count_) {
-      return BOOL;
-    } else if (string_count_) {
-      return STRING;
+      return arrow::boolean();
+    } else if (binary_count_) {
+      return arrow::binary();
+    } else if (unicode_count_) {
+      return arrow::utf8();
     } else {
-      return NA;
+      return arrow::null();
     }
   }
 
@@ -115,7 +112,8 @@ class ScalarVisitor {
   int64_t date_count_;
   int64_t timestamp_count_;
   int64_t float_count_;
-  int64_t string_count_;
+  int64_t binary_count_;
+  int64_t unicode_count_;
 
   // Place to accumulate errors
   // std::vector<Status> errors_;
@@ -163,7 +161,7 @@ class SeqVisitor {
   std::shared_ptr<DataType> GetType() {
     if (scalars_.total_count() == 0) {
       if (max_nesting_level_ == 0) {
-        return NA;
+        return arrow::null();
       } else {
         return nullptr;
       }
@@ -227,7 +225,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
 
   // For 0-length sequences, refuse to guess
   if (*size == 0) {
-    *out_type = NA;
+    *out_type = arrow::null();
   }
 
   SeqVisitor seq_visitor;
@@ -381,7 +379,7 @@ class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
   }
 };
 
-class StringConverter : public TypedConverter<arrow::StringBuilder> {
+class BytesConverter : public TypedConverter<arrow::BinaryBuilder> {
  public:
   Status AppendData(PyObject* seq) override {
     PyObject* item;
@@ -415,6 +413,38 @@ class StringConverter : public TypedConverter<arrow::StringBuilder> {
   }
 };
 
+class UTF8Converter : public TypedConverter<arrow::StringBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {
+    PyObject* item;
+    PyObject* bytes_obj;
+    OwnedRef tmp;
+    const char* bytes;
+    int32_t length;
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      item = PySequence_GetItem(seq, i);
+      OwnedRef holder(item);
+
+      if (item == Py_None) {
+        RETURN_NOT_OK(typed_builder_->AppendNull());
+        continue;
+      } else if (!PyUnicode_Check(item)) {
+        return Status::TypeError("Non-unicode value encountered");
+      }
+      tmp.reset(PyUnicode_AsUTF8String(item));
+      RETURN_IF_PYERROR();
+      bytes_obj = tmp.obj();
+
+      // No error checking
+      length = PyBytes_GET_SIZE(bytes_obj);
+      bytes = PyBytes_AS_STRING(bytes_obj);
+      RETURN_NOT_OK(typed_builder_->Append(bytes, length));
+    }
+    return Status::OK();
+  }
+};
+
 class ListConverter : public TypedConverter<arrow::ListBuilder> {
  public:
   Status Init(const std::shared_ptr<ArrayBuilder>& builder) override;
@@ -449,8 +479,10 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
       return std::make_shared<TimestampConverter>();
     case Type::DOUBLE:
       return std::make_shared<DoubleConverter>();
+    case Type::BINARY:
+      return std::make_shared<BytesConverter>();
     case Type::STRING:
-      return std::make_shared<StringConverter>();
+      return std::make_shared<UTF8Converter>();
     case Type::LIST:
       return std::make_shared<ListConverter>();
     case Type::STRUCT:

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index f8dff6d..38f3b6f 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -193,6 +193,9 @@ class ArrowSerializer {
   Status ConvertObjectStrings(std::shared_ptr<Array>* out) {
     PyAcquireGIL lock;
 
+    // The output type at this point is inconclusive because there may be bytes
+    // and unicode mixed in the object array
+
     PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
     arrow::TypePtr string_type(new arrow::StringType());
     arrow::StringBuilder string_builder(pool_, string_type);
@@ -200,6 +203,7 @@ class ArrowSerializer {
 
     Status s;
     PyObject* obj;
+    bool have_bytes = false;
     for (int64_t i = 0; i < length_; ++i) {
       obj = objects[i];
       if (PyUnicode_Check(obj)) {
@@ -215,13 +219,21 @@ class ArrowSerializer {
           return s;
         }
       } else if (PyBytes_Check(obj)) {
+        have_bytes = true;
         const int32_t length = PyBytes_GET_SIZE(obj);
         RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
       } else {
         string_builder.AppendNull();
       }
     }
-    return string_builder.Finish(out);
+    RETURN_NOT_OK(string_builder.Finish(out));
+
+    if (have_bytes) {
+      const auto& arr = static_cast<const arrow::StringArray&>(*out->get());
+      *out = std::make_shared<arrow::BinaryArray>(arr.length(), arr.offsets(),
+          arr.data(), arr.null_count(), arr.null_bitmap());
+    }
+    return Status::OK();
   }
 
   Status ConvertBooleans(std::shared_ptr<Array>* out) {
@@ -865,7 +877,7 @@ class ArrowDeserializer {
     return Status::OK();
   }
 
-  // UTF8
+  // UTF8 strings
   template <int T2>
   inline typename std::enable_if<
     T2 == arrow::Type::STRING, Status>::type
@@ -912,6 +924,54 @@ class ArrowDeserializer {
     return Status::OK();
   }
 
+  template <int T2>
+  inline typename std::enable_if<
+    T2 == arrow::Type::BINARY, Status>::type
+  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
+    size_t chunk_offset = 0;
+    PyAcquireGIL lock;
+
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+
+    for (int c = 0; c < data->num_chunks(); c++) {
+      const std::shared_ptr<Array> arr = data->chunk(c);
+      auto binary_arr = static_cast<arrow::BinaryArray*>(arr.get());
+      auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
+
+      const uint8_t* data_ptr;
+      int32_t length;
+      if (data->null_count() > 0) {
+        for (int64_t i = 0; i < arr->length(); ++i) {
+          if (binary_arr->IsNull(i)) {
+            Py_INCREF(Py_None);
+            out_values[i] = Py_None;
+          } else {
+            data_ptr = binary_arr->GetValue(i, &length);
+
+            out_values[i] = PyBytes_FromStringAndSize(
+                reinterpret_cast<const char*>(data_ptr), length);
+            if (out_values[i] == nullptr) {
+              return Status::UnknownError("String initialization failed");
+            }
+          }
+        }
+      } else {
+        for (int64_t i = 0; i < arr->length(); ++i) {
+          data_ptr = binary_arr->GetValue(i, &length);
+          out_values[i] = PyBytes_FromStringAndSize(
+              reinterpret_cast<const char*>(data_ptr), length);
+          if (out_values[i] == nullptr) {
+            return Status::UnknownError("String initialization failed");
+          }
+        }
+      }
+
+      chunk_offset += binary_arr->length();
+    }
+
+    return Status::OK();
+  }
+
  private:
   std::shared_ptr<Column> col_;
   PyObject* py_ref_;
@@ -948,6 +1008,7 @@ Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_re
     FROM_ARROW_CASE(UINT64);
     FROM_ARROW_CASE(FLOAT);
     FROM_ARROW_CASE(DOUBLE);
+    FROM_ARROW_CASE(BINARY);
     FROM_ARROW_CASE(STRING);
     FROM_ARROW_CASE(DATE);
     FROM_ARROW_CASE(TIMESTAMP);

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index af92744..b42199c 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -23,47 +23,33 @@ using namespace arrow;
 
 namespace pyarrow {
 
-const std::shared_ptr<NullType> NA = std::make_shared<NullType>();
-const std::shared_ptr<BooleanType> BOOL = std::make_shared<BooleanType>();
-const std::shared_ptr<UInt8Type> UINT8 = std::make_shared<UInt8Type>();
-const std::shared_ptr<UInt16Type> UINT16 = std::make_shared<UInt16Type>();
-const std::shared_ptr<UInt32Type> UINT32 = std::make_shared<UInt32Type>();
-const std::shared_ptr<UInt64Type> UINT64 = std::make_shared<UInt64Type>();
-const std::shared_ptr<Int8Type> INT8 = std::make_shared<Int8Type>();
-const std::shared_ptr<Int16Type> INT16 = std::make_shared<Int16Type>();
-const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>();
-const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>();
-const std::shared_ptr<DateType> DATE = std::make_shared<DateType>();
-const std::shared_ptr<TimestampType> TIMESTAMP_US = std::make_shared<TimestampType>(TimeUnit::MICRO);
-const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>();
-const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>();
-const std::shared_ptr<StringType> STRING = std::make_shared<StringType>();
 
-#define GET_PRIMITIVE_TYPE(NAME, Class)         \
+#define GET_PRIMITIVE_TYPE(NAME, FACTORY)       \
   case Type::NAME:                              \
-    return NAME;                                \
+    return FACTORY();                           \
     break;
 
 std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
   switch (type) {
     case Type::NA:
-      return NA;
-    GET_PRIMITIVE_TYPE(UINT8, UInt8Type);
-    GET_PRIMITIVE_TYPE(INT8, Int8Type);
-    GET_PRIMITIVE_TYPE(UINT16, UInt16Type);
-    GET_PRIMITIVE_TYPE(INT16, Int16Type);
-    GET_PRIMITIVE_TYPE(UINT32, UInt32Type);
-    GET_PRIMITIVE_TYPE(INT32, Int32Type);
-    GET_PRIMITIVE_TYPE(UINT64, UInt64Type);
-    GET_PRIMITIVE_TYPE(INT64, Int64Type);
-    GET_PRIMITIVE_TYPE(DATE, DateType);
+      return null();
+    GET_PRIMITIVE_TYPE(UINT8, uint8);
+    GET_PRIMITIVE_TYPE(INT8, int8);
+    GET_PRIMITIVE_TYPE(UINT16, uint16);
+    GET_PRIMITIVE_TYPE(INT16, int16);
+    GET_PRIMITIVE_TYPE(UINT32, uint32);
+    GET_PRIMITIVE_TYPE(INT32, int32);
+    GET_PRIMITIVE_TYPE(UINT64, uint64);
+    GET_PRIMITIVE_TYPE(INT64, int64);
+    GET_PRIMITIVE_TYPE(DATE, date);
     case Type::TIMESTAMP:
-      return TIMESTAMP_US;
+      return arrow::timestamp(arrow::TimeUnit::MICRO);
       break;
-    GET_PRIMITIVE_TYPE(BOOL, BooleanType);
-    GET_PRIMITIVE_TYPE(FLOAT, FloatType);
-    GET_PRIMITIVE_TYPE(DOUBLE, DoubleType);
-    GET_PRIMITIVE_TYPE(STRING, StringType);
+    GET_PRIMITIVE_TYPE(BOOL, boolean);
+    GET_PRIMITIVE_TYPE(FLOAT, float32);
+    GET_PRIMITIVE_TYPE(DOUBLE, float64);
+    GET_PRIMITIVE_TYPE(BINARY, binary);
+    GET_PRIMITIVE_TYPE(STRING, utf8);
     default:
       return nullptr;
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
index e714bba..8334d97 100644
--- a/python/src/pyarrow/helpers.h
+++ b/python/src/pyarrow/helpers.h
@@ -28,22 +28,6 @@ namespace pyarrow {
 using arrow::DataType;
 using arrow::Type;
 
-extern const std::shared_ptr<arrow::NullType> NA;
-extern const std::shared_ptr<arrow::BooleanType> BOOL;
-extern const std::shared_ptr<arrow::UInt8Type> UINT8;
-extern const std::shared_ptr<arrow::UInt16Type> UINT16;
-extern const std::shared_ptr<arrow::UInt32Type> UINT32;
-extern const std::shared_ptr<arrow::UInt64Type> UINT64;
-extern const std::shared_ptr<arrow::Int8Type> INT8;
-extern const std::shared_ptr<arrow::Int16Type> INT16;
-extern const std::shared_ptr<arrow::Int32Type> INT32;
-extern const std::shared_ptr<arrow::Int64Type> INT64;
-extern const std::shared_ptr<arrow::DateType> DATE;
-extern const std::shared_ptr<arrow::TimestampType> TIMESTAMP_US;
-extern const std::shared_ptr<arrow::FloatType> FLOAT;
-extern const std::shared_ptr<arrow::DoubleType> DOUBLE;
-extern const std::shared_ptr<arrow::StringType> STRING;
-
 PYARROW_EXPORT
 std::shared_ptr<DataType> GetPrimitiveType(Type::type type);