You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2016/12/21 08:32:05 UTC
arrow git commit: ARROW-374: More precise handling of bytes vs unicode in Python API
Repository: arrow
Updated Branches:
refs/heads/master 73455b56f -> 268ffbeff
ARROW-374: More precise handling of bytes vs unicode in Python API
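Sequences of Python built-in string values that are not all unicode (i.e. that contain at least one PyBytes object) are now converted to `arrow::BinaryArray` instead of `arrow::StringArray`, since we cannot be sure that the PyBytes objects are UTF-8-encoded strings. Sequences containing only unicode values are still converted to `arrow::StringArray`.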
Author: Wes McKinney <we...@twosigma.com>
Closes #249 from wesm/ARROW-374 and squashes the following commits:
1371a30 [Wes McKinney] py3 fixes
8ac3a49 [Wes McKinney] Consistently convert PyBytes to BinaryArray with pandas, too
83d1c05 [Wes McKinney] Remove print statement
c8df606 [Wes McKinney] Timestamp and time cannot be static
4a9aaf4 [Wes McKinney] Add Python interface to BinaryArray, convert PyBytes to binary instead of assuming utf8 unicode
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/268ffbef
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/268ffbef
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/268ffbef
Branch: refs/heads/master
Commit: 268ffbeffb1cd0617e52d381d500a2d10f61124c
Parents: 73455b5
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Dec 21 09:31:56 2016 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Dec 21 09:31:56 2016 +0100
----------------------------------------------------------------------
cpp/src/arrow/type.cc | 6 +-
python/pyarrow/__init__.py | 5 +-
python/pyarrow/array.pyx | 5 ++
python/pyarrow/includes/libarrow.pxd | 6 +-
python/pyarrow/scalar.pyx | 16 ++++-
python/pyarrow/schema.pyx | 6 ++
python/pyarrow/tests/test_convert_builtin.py | 31 ++++++---
python/pyarrow/tests/test_convert_pandas.py | 18 +++--
python/pyarrow/tests/test_scalars.py | 22 +++++--
python/src/pyarrow/adapters/builtin.cc | 80 ++++++++++++++++-------
python/src/pyarrow/adapters/pandas.cc | 65 +++++++++++++++++-
python/src/pyarrow/helpers.cc | 50 +++++---------
python/src/pyarrow/helpers.h | 16 -----
13 files changed, 227 insertions(+), 99 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 4748cc3..8ff9eea 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -155,13 +155,11 @@ TYPE_FACTORY(binary, BinaryType);
TYPE_FACTORY(date, DateType);
std::shared_ptr<DataType> timestamp(TimeUnit unit) {
- static std::shared_ptr<DataType> result = std::make_shared<TimestampType>();
- return result;
+ return std::make_shared<TimestampType>(unit);
}
std::shared_ptr<DataType> time(TimeUnit unit) {
- static std::shared_ptr<DataType> result = std::make_shared<TimeType>();
- return result;
+ return std::make_shared<TimeType>(unit);
}
std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 39ba4c7..9ede934 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -40,13 +40,14 @@ from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
BooleanValue,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
- FloatValue, DoubleValue, ListValue, StringValue)
+ FloatValue, DoubleValue, ListValue,
+ BinaryValue, StringValue)
from pyarrow.schema import (null, bool_,
int8, int16, int32, int64,
uint8, uint16, uint32, uint64,
timestamp, date,
- float_, double, string,
+ float_, double, binary, string,
list_, struct, field,
DataType, Field, Schema, schema)
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 84f1705..c178d5c 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -238,6 +238,10 @@ cdef class StringArray(Array):
pass
+cdef class BinaryArray(Array):
+ pass
+
+
cdef dict _array_classes = {
Type_NA: NullArray,
Type_BOOL: BooleanArray,
@@ -253,6 +257,7 @@ cdef dict _array_classes = {
Type_FLOAT: FloatArray,
Type_DOUBLE: DoubleArray,
Type_LIST: ListArray,
+ Type_BINARY: BinaryArray,
Type_STRING: StringArray,
Type_TIMESTAMP: Int64Array,
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 419dd74..40fb60d 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -40,6 +40,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
Type_TIMESTAMP" arrow::Type::TIMESTAMP"
Type_DATE" arrow::Type::DATE"
+ Type_BINARY" arrow::Type::BINARY"
Type_STRING" arrow::Type::STRING"
Type_LIST" arrow::Type::LIST"
@@ -161,7 +162,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CArray] values()
shared_ptr[CDataType] value_type()
- cdef cppclass CStringArray" arrow::StringArray"(CListArray):
+ cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
+ const uint8_t* GetValue(int i, int32_t* length)
+
+ cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray):
c_string GetString(int i)
cdef cppclass CChunkedArray" arrow::ChunkedArray":
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 623e3e4..a0610a1 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -22,6 +22,7 @@ import pyarrow.schema as schema
import datetime
+cimport cpython as cp
NA = None
@@ -170,6 +171,18 @@ cdef class StringValue(ArrayValue):
return frombytes(ap.GetString(self.index))
+cdef class BinaryValue(ArrayValue):
+
+ def as_py(self):
+ cdef:
+ const uint8_t* ptr
+ int32_t length
+ CBinaryArray* ap = <CBinaryArray*> self.sp_array.get()
+
+ ptr = ap.GetValue(self.index, &length)
+ return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
+
+
cdef class ListValue(ArrayValue):
def __len__(self):
@@ -218,7 +231,8 @@ cdef dict _scalar_classes = {
Type_FLOAT: FloatValue,
Type_DOUBLE: DoubleValue,
Type_LIST: ListValue,
- Type_STRING: StringValue
+ Type_BINARY: BinaryValue,
+ Type_STRING: StringValue,
}
cdef object box_arrow_scalar(DataType type,
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index d05ac9e..7a69b0f 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -215,6 +215,12 @@ def string():
"""
return primitive_type(Type_STRING)
+def binary():
+ """
+ Binary (PyBytes-like) type
+ """
+ return primitive_type(Type_BINARY)
+
def list_(DataType value_type):
cdef DataType out = DataType()
cdef shared_ptr[CDataType] list_type
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 7dc1c1b..a5f7aa5 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-from pyarrow.compat import unittest
+from pyarrow.compat import unittest, u
import pyarrow
import datetime
@@ -71,16 +71,28 @@ class TestConvertList(unittest.TestCase):
assert arr.type == pyarrow.double()
assert arr.to_pylist() == data
- def test_string(self):
- data = ['foo', b'bar', None, 'arrow']
+ def test_unicode(self):
+ data = [u('foo'), u('bar'), None, u('arrow')]
arr = pyarrow.from_pylist(data)
assert len(arr) == 4
assert arr.null_count == 1
assert arr.type == pyarrow.string()
- assert arr.to_pylist() == ['foo', 'bar', None, 'arrow']
+ assert arr.to_pylist() == [u('foo'), u('bar'), None, u('arrow')]
+
+ def test_bytes(self):
+ u1 = b'ma\xc3\xb1ana'
+ data = [b'foo',
+ u1.decode('utf-8'), # unicode gets encoded,
+ None]
+ arr = pyarrow.from_pylist(data)
+ assert len(arr) == 3
+ assert arr.null_count == 1
+ assert arr.type == pyarrow.binary()
+ assert arr.to_pylist() == [b'foo', u1, None]
def test_date(self):
- data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)]
+ data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
+ datetime.date(2040, 2, 26)]
arr = pyarrow.from_pylist(data)
assert len(arr) == 4
assert arr.type == pyarrow.date()
@@ -101,10 +113,13 @@ class TestConvertList(unittest.TestCase):
assert len(arr) == 4
assert arr.type == pyarrow.timestamp()
assert arr.null_count == 1
- assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)
+ assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
assert arr[1].as_py() is None
- assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
- assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
+ assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
+ 34, 56, 432539)
+ assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
+ 46, 57, 437699)
def test_mixed_nesting_levels(self):
pyarrow.from_pylist([1, 2, None])
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index cf50f3d..da34f85 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -23,6 +23,7 @@ import numpy as np
import pandas as pd
import pandas.util.testing as tm
+from pyarrow.compat import u
import pyarrow as A
@@ -157,13 +158,22 @@ class TestPandasConversion(unittest.TestCase):
df = pd.DataFrame({'bools': arr})
self._check_pandas_roundtrip(df)
- def test_strings(self):
+ def test_unicode(self):
repeats = 1000
- values = [b'foo', None, u'bar', 'qux', np.nan]
+ values = [u('foo'), None, u('bar'), u('qux'), np.nan]
df = pd.DataFrame({'strings': values * repeats})
- values = ['foo', None, u'bar', 'qux', None]
- expected = pd.DataFrame({'strings': values * repeats})
+ self._check_pandas_roundtrip(df)
+
+ def test_bytes_to_binary(self):
+ values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
+ df = pd.DataFrame({'strings': values})
+
+ table = A.from_pandas_dataframe(df)
+ assert table[0].type == A.binary()
+
+ values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
+ expected = pd.DataFrame({'strings': values2})
self._check_pandas_roundtrip(df, expected)
def test_timestamps_notimezone_no_nulls(self):
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 4fb850a..19cfacb 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-from pyarrow.compat import unittest, u
+from pyarrow.compat import unittest, u, unicode_type
import pyarrow as A
@@ -58,20 +58,32 @@ class TestScalars(unittest.TestCase):
v = arr[2]
assert v.as_py() == 3.0
- def test_string(self):
- arr = A.from_pylist(['foo', None, u('bar')])
+ def test_string_unicode(self):
+ arr = A.from_pylist([u('foo'), None, u('bar')])
v = arr[0]
assert isinstance(v, A.StringValue)
- assert repr(v) == "'foo'"
assert v.as_py() == 'foo'
assert arr[1] is A.NA
v = arr[2].as_py()
- assert v == 'bar'
+ assert v == u('bar')
assert isinstance(v, str)
+ def test_bytes(self):
+ arr = A.from_pylist([b'foo', None, u('bar')])
+
+ v = arr[0]
+ assert isinstance(v, A.BinaryValue)
+ assert v.as_py() == b'foo'
+
+ assert arr[1] is A.NA
+
+ v = arr[2].as_py()
+ assert v == b'bar'
+ assert isinstance(v, bytes)
+
def test_list(self):
arr = A.from_pylist([['foo', None], None, ['bar'], []])
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index e0cb7c2..2a13944 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -42,14 +42,6 @@ static inline bool IsPyInteger(PyObject* obj) {
#endif
}
-static inline bool IsPyBaseString(PyObject* obj) {
-#if PYARROW_IS_PY2
- return PyString_Check(obj) || PyUnicode_Check(obj);
-#else
- return PyUnicode_Check(obj);
-#endif
-}
-
class ScalarVisitor {
public:
ScalarVisitor() :
@@ -60,7 +52,8 @@ class ScalarVisitor {
date_count_(0),
timestamp_count_(0),
float_count_(0),
- string_count_(0) {}
+ binary_count_(0),
+ unicode_count_(0) {}
void Visit(PyObject* obj) {
++total_count_;
@@ -76,8 +69,10 @@ class ScalarVisitor {
++date_count_;
} else if (PyDateTime_CheckExact(obj)) {
++timestamp_count_;
- } else if (IsPyBaseString(obj)) {
- ++string_count_;
+ } else if (PyBytes_Check(obj)) {
+ ++binary_count_;
+ } else if (PyUnicode_Check(obj)) {
+ ++unicode_count_;
} else {
// TODO(wesm): accumulate error information somewhere
}
@@ -86,20 +81,22 @@ class ScalarVisitor {
std::shared_ptr<DataType> GetType() {
// TODO(wesm): handling mixed-type cases
if (float_count_) {
- return DOUBLE;
+ return arrow::float64();
} else if (int_count_) {
// TODO(wesm): tighter type later
- return INT64;
+ return arrow::int64();
} else if (date_count_) {
- return DATE;
+ return arrow::date();
} else if (timestamp_count_) {
- return TIMESTAMP_US;
+ return arrow::timestamp(arrow::TimeUnit::MICRO);
} else if (bool_count_) {
- return BOOL;
- } else if (string_count_) {
- return STRING;
+ return arrow::boolean();
+ } else if (binary_count_) {
+ return arrow::binary();
+ } else if (unicode_count_) {
+ return arrow::utf8();
} else {
- return NA;
+ return arrow::null();
}
}
@@ -115,7 +112,8 @@ class ScalarVisitor {
int64_t date_count_;
int64_t timestamp_count_;
int64_t float_count_;
- int64_t string_count_;
+ int64_t binary_count_;
+ int64_t unicode_count_;
// Place to accumulate errors
// std::vector<Status> errors_;
@@ -163,7 +161,7 @@ class SeqVisitor {
std::shared_ptr<DataType> GetType() {
if (scalars_.total_count() == 0) {
if (max_nesting_level_ == 0) {
- return NA;
+ return arrow::null();
} else {
return nullptr;
}
@@ -227,7 +225,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
// For 0-length sequences, refuse to guess
if (*size == 0) {
- *out_type = NA;
+ *out_type = arrow::null();
}
SeqVisitor seq_visitor;
@@ -381,7 +379,7 @@ class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
}
};
-class StringConverter : public TypedConverter<arrow::StringBuilder> {
+class BytesConverter : public TypedConverter<arrow::BinaryBuilder> {
public:
Status AppendData(PyObject* seq) override {
PyObject* item;
@@ -415,6 +413,38 @@ class StringConverter : public TypedConverter<arrow::StringBuilder> {
}
};
+class UTF8Converter : public TypedConverter<arrow::StringBuilder> {
+ public:
+ Status AppendData(PyObject* seq) override {
+ PyObject* item;
+ PyObject* bytes_obj;
+ OwnedRef tmp;
+ const char* bytes;
+ int32_t length;
+ Py_ssize_t size = PySequence_Size(seq);
+ for (int64_t i = 0; i < size; ++i) {
+ item = PySequence_GetItem(seq, i);
+ OwnedRef holder(item);
+
+ if (item == Py_None) {
+ RETURN_NOT_OK(typed_builder_->AppendNull());
+ continue;
+ } else if (!PyUnicode_Check(item)) {
+ return Status::TypeError("Non-unicode value encountered");
+ }
+ tmp.reset(PyUnicode_AsUTF8String(item));
+ RETURN_IF_PYERROR();
+ bytes_obj = tmp.obj();
+
+ // No error checking
+ length = PyBytes_GET_SIZE(bytes_obj);
+ bytes = PyBytes_AS_STRING(bytes_obj);
+ RETURN_NOT_OK(typed_builder_->Append(bytes, length));
+ }
+ return Status::OK();
+ }
+};
+
class ListConverter : public TypedConverter<arrow::ListBuilder> {
public:
Status Init(const std::shared_ptr<ArrayBuilder>& builder) override;
@@ -449,8 +479,10 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
return std::make_shared<TimestampConverter>();
case Type::DOUBLE:
return std::make_shared<DoubleConverter>();
+ case Type::BINARY:
+ return std::make_shared<BytesConverter>();
case Type::STRING:
- return std::make_shared<StringConverter>();
+ return std::make_shared<UTF8Converter>();
case Type::LIST:
return std::make_shared<ListConverter>();
case Type::STRUCT:
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index f8dff6d..38f3b6f 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -193,6 +193,9 @@ class ArrowSerializer {
Status ConvertObjectStrings(std::shared_ptr<Array>* out) {
PyAcquireGIL lock;
+ // The output type at this point is inconclusive because there may be bytes
+ // and unicode mixed in the object array
+
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
arrow::TypePtr string_type(new arrow::StringType());
arrow::StringBuilder string_builder(pool_, string_type);
@@ -200,6 +203,7 @@ class ArrowSerializer {
Status s;
PyObject* obj;
+ bool have_bytes = false;
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
if (PyUnicode_Check(obj)) {
@@ -215,13 +219,21 @@ class ArrowSerializer {
return s;
}
} else if (PyBytes_Check(obj)) {
+ have_bytes = true;
const int32_t length = PyBytes_GET_SIZE(obj);
RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
} else {
string_builder.AppendNull();
}
}
- return string_builder.Finish(out);
+ RETURN_NOT_OK(string_builder.Finish(out));
+
+ if (have_bytes) {
+ const auto& arr = static_cast<const arrow::StringArray&>(*out->get());
+ *out = std::make_shared<arrow::BinaryArray>(arr.length(), arr.offsets(),
+ arr.data(), arr.null_count(), arr.null_bitmap());
+ }
+ return Status::OK();
}
Status ConvertBooleans(std::shared_ptr<Array>* out) {
@@ -865,7 +877,7 @@ class ArrowDeserializer {
return Status::OK();
}
- // UTF8
+ // UTF8 strings
template <int T2>
inline typename std::enable_if<
T2 == arrow::Type::STRING, Status>::type
@@ -912,6 +924,54 @@ class ArrowDeserializer {
return Status::OK();
}
+ template <int T2>
+ inline typename std::enable_if<
+ T2 == arrow::Type::BINARY, Status>::type
+ ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
+ size_t chunk_offset = 0;
+ PyAcquireGIL lock;
+
+ RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+
+ for (int c = 0; c < data->num_chunks(); c++) {
+ const std::shared_ptr<Array> arr = data->chunk(c);
+ auto binary_arr = static_cast<arrow::BinaryArray*>(arr.get());
+ auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset;
+
+ const uint8_t* data_ptr;
+ int32_t length;
+ if (data->null_count() > 0) {
+ for (int64_t i = 0; i < arr->length(); ++i) {
+ if (binary_arr->IsNull(i)) {
+ Py_INCREF(Py_None);
+ out_values[i] = Py_None;
+ } else {
+ data_ptr = binary_arr->GetValue(i, &length);
+
+ out_values[i] = PyBytes_FromStringAndSize(
+ reinterpret_cast<const char*>(data_ptr), length);
+ if (out_values[i] == nullptr) {
+ return Status::UnknownError("String initialization failed");
+ }
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < arr->length(); ++i) {
+ data_ptr = binary_arr->GetValue(i, &length);
+ out_values[i] = PyBytes_FromStringAndSize(
+ reinterpret_cast<const char*>(data_ptr), length);
+ if (out_values[i] == nullptr) {
+ return Status::UnknownError("String initialization failed");
+ }
+ }
+ }
+
+ chunk_offset += binary_arr->length();
+ }
+
+ return Status::OK();
+ }
+
private:
std::shared_ptr<Column> col_;
PyObject* py_ref_;
@@ -948,6 +1008,7 @@ Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_re
FROM_ARROW_CASE(UINT64);
FROM_ARROW_CASE(FLOAT);
FROM_ARROW_CASE(DOUBLE);
+ FROM_ARROW_CASE(BINARY);
FROM_ARROW_CASE(STRING);
FROM_ARROW_CASE(DATE);
FROM_ARROW_CASE(TIMESTAMP);
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index af92744..b42199c 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -23,47 +23,33 @@ using namespace arrow;
namespace pyarrow {
-const std::shared_ptr<NullType> NA = std::make_shared<NullType>();
-const std::shared_ptr<BooleanType> BOOL = std::make_shared<BooleanType>();
-const std::shared_ptr<UInt8Type> UINT8 = std::make_shared<UInt8Type>();
-const std::shared_ptr<UInt16Type> UINT16 = std::make_shared<UInt16Type>();
-const std::shared_ptr<UInt32Type> UINT32 = std::make_shared<UInt32Type>();
-const std::shared_ptr<UInt64Type> UINT64 = std::make_shared<UInt64Type>();
-const std::shared_ptr<Int8Type> INT8 = std::make_shared<Int8Type>();
-const std::shared_ptr<Int16Type> INT16 = std::make_shared<Int16Type>();
-const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>();
-const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>();
-const std::shared_ptr<DateType> DATE = std::make_shared<DateType>();
-const std::shared_ptr<TimestampType> TIMESTAMP_US = std::make_shared<TimestampType>(TimeUnit::MICRO);
-const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>();
-const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>();
-const std::shared_ptr<StringType> STRING = std::make_shared<StringType>();
-#define GET_PRIMITIVE_TYPE(NAME, Class) \
+#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
case Type::NAME: \
- return NAME; \
+ return FACTORY(); \
break;
std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
switch (type) {
case Type::NA:
- return NA;
- GET_PRIMITIVE_TYPE(UINT8, UInt8Type);
- GET_PRIMITIVE_TYPE(INT8, Int8Type);
- GET_PRIMITIVE_TYPE(UINT16, UInt16Type);
- GET_PRIMITIVE_TYPE(INT16, Int16Type);
- GET_PRIMITIVE_TYPE(UINT32, UInt32Type);
- GET_PRIMITIVE_TYPE(INT32, Int32Type);
- GET_PRIMITIVE_TYPE(UINT64, UInt64Type);
- GET_PRIMITIVE_TYPE(INT64, Int64Type);
- GET_PRIMITIVE_TYPE(DATE, DateType);
+ return null();
+ GET_PRIMITIVE_TYPE(UINT8, uint8);
+ GET_PRIMITIVE_TYPE(INT8, int8);
+ GET_PRIMITIVE_TYPE(UINT16, uint16);
+ GET_PRIMITIVE_TYPE(INT16, int16);
+ GET_PRIMITIVE_TYPE(UINT32, uint32);
+ GET_PRIMITIVE_TYPE(INT32, int32);
+ GET_PRIMITIVE_TYPE(UINT64, uint64);
+ GET_PRIMITIVE_TYPE(INT64, int64);
+ GET_PRIMITIVE_TYPE(DATE, date);
case Type::TIMESTAMP:
- return TIMESTAMP_US;
+ return arrow::timestamp(arrow::TimeUnit::MICRO);
break;
- GET_PRIMITIVE_TYPE(BOOL, BooleanType);
- GET_PRIMITIVE_TYPE(FLOAT, FloatType);
- GET_PRIMITIVE_TYPE(DOUBLE, DoubleType);
- GET_PRIMITIVE_TYPE(STRING, StringType);
+ GET_PRIMITIVE_TYPE(BOOL, boolean);
+ GET_PRIMITIVE_TYPE(FLOAT, float32);
+ GET_PRIMITIVE_TYPE(DOUBLE, float64);
+ GET_PRIMITIVE_TYPE(BINARY, binary);
+ GET_PRIMITIVE_TYPE(STRING, utf8);
default:
return nullptr;
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
index e714bba..8334d97 100644
--- a/python/src/pyarrow/helpers.h
+++ b/python/src/pyarrow/helpers.h
@@ -28,22 +28,6 @@ namespace pyarrow {
using arrow::DataType;
using arrow::Type;
-extern const std::shared_ptr<arrow::NullType> NA;
-extern const std::shared_ptr<arrow::BooleanType> BOOL;
-extern const std::shared_ptr<arrow::UInt8Type> UINT8;
-extern const std::shared_ptr<arrow::UInt16Type> UINT16;
-extern const std::shared_ptr<arrow::UInt32Type> UINT32;
-extern const std::shared_ptr<arrow::UInt64Type> UINT64;
-extern const std::shared_ptr<arrow::Int8Type> INT8;
-extern const std::shared_ptr<arrow::Int16Type> INT16;
-extern const std::shared_ptr<arrow::Int32Type> INT32;
-extern const std::shared_ptr<arrow::Int64Type> INT64;
-extern const std::shared_ptr<arrow::DateType> DATE;
-extern const std::shared_ptr<arrow::TimestampType> TIMESTAMP_US;
-extern const std::shared_ptr<arrow::FloatType> FLOAT;
-extern const std::shared_ptr<arrow::DoubleType> DOUBLE;
-extern const std::shared_ptr<arrow::StringType> STRING;
-
PYARROW_EXPORT
std::shared_ptr<DataType> GetPrimitiveType(Type::type type);