You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/18 23:28:09 UTC
arrow git commit: ARROW-468: Python: Conversion of nested data in
pd.DataFrames
Repository: arrow
Updated Branches:
refs/heads/master b1472305c -> 353772f84
ARROW-468: Python: Conversion of nested data in pd.DataFrames
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #289 from xhochy/ARROW-468 and squashes the following commits:
6fab6b2 [Uwe L. Korn] clang-format
c30da77 [Uwe L. Korn] Conversion for Lists of String and Timestamp
3ac373e [Uwe L. Korn] Fix string conversion
23fdc97 [Uwe L. Korn] Conversion of nested arrays to Pandas
a8197f7 [Uwe L. Korn] ARROW-468: Python: Conversion of nested data in pd.DataFrames
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/353772f8
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/353772f8
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/353772f8
Branch: refs/heads/master
Commit: 353772f844e227038ea8a3c5328a70e5fe553773
Parents: b147230
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Wed Jan 18 18:28:02 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Jan 18 18:28:02 2017 -0500
----------------------------------------------------------------------
python/pyarrow/array.pyx | 14 +-
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/includes/pyarrow.pxd | 9 +-
python/pyarrow/schema.pxd | 1 +
python/pyarrow/schema.pyx | 19 +
python/pyarrow/table.pyx | 26 +-
python/pyarrow/tests/test_convert_pandas.py | 53 ++-
python/src/pyarrow/adapters/pandas.cc | 448 +++++++++++++++++++----
python/src/pyarrow/adapters/pandas.h | 7 +-
9 files changed, 477 insertions(+), 101 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 266768f..4299ba6 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -33,7 +33,7 @@ from pyarrow.error cimport check_status
cimport pyarrow.scalar as scalar
from pyarrow.scalar import NA
-from pyarrow.schema cimport Schema
+from pyarrow.schema cimport Field, Schema
import pyarrow.schema as schema
cimport cpython
@@ -322,7 +322,7 @@ def from_pylist(object list_obj, DataType type=None):
return box_arrow_array(sp_array)
-def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
+def from_pandas_series(object series, object mask=None, timestamps_to_ms=False, Field field=None):
"""
Convert pandas.Series to an Arrow Array.
@@ -338,26 +338,32 @@ def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
compability with other functionality like Parquet I/O which
only supports milliseconds.
+ field: pyarrow.Field
+ Schema hint specifying the Arrow type this column should be converted to
+
Returns
-------
pyarrow.array.Array
"""
cdef:
shared_ptr[CArray] out
+ shared_ptr[CField] c_field
series_values = series_as_ndarray(series)
if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
series_values = series_values.astype('datetime64[ms]')
+ if field is not None:
+ c_field = field.sp_field
if mask is None:
with nogil:
check_status(pyarrow.PandasToArrow(pyarrow.get_memory_pool(),
- series_values, &out))
+ series_values, c_field, &out))
else:
mask = series_as_ndarray(mask)
with nogil:
check_status(pyarrow.PandasMaskedToArrow(
- pyarrow.get_memory_pool(), series_values, mask, &out))
+ pyarrow.get_memory_pool(), series_values, mask, c_field, &out))
return box_arrow_array(out)
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8cfaaf7..8b0e3b6 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -107,6 +107,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool Equals(const shared_ptr[CSchema]& other)
shared_ptr[CField] field(int i)
+ shared_ptr[CField] GetFieldByName(c_string& name)
int num_fields()
c_string ToString()
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index 901e6c9..b7b8d7c 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -18,9 +18,9 @@
# distutils: language = c++
from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
- CDataType, CStatus, Type, MemoryPool,
- TimeUnit)
+from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CField,
+ CTable, CDataType, CStatus, Type,
+ MemoryPool, TimeUnit)
cimport pyarrow.includes.libarrow_io as arrow_io
@@ -30,9 +30,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out)
- CStatus PandasToArrow(MemoryPool* pool, object ao,
+ CStatus PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CField] field,
shared_ptr[CArray]* out)
CStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo,
+ shared_ptr[CField] field,
shared_ptr[CArray]* out)
CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr,
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd
index f2cb776..42588d4 100644
--- a/python/pyarrow/schema.pxd
+++ b/python/pyarrow/schema.pxd
@@ -44,4 +44,5 @@ cdef class Schema:
cdef init_schema(self, const shared_ptr[CSchema]& schema)
cdef DataType box_data_type(const shared_ptr[CDataType]& type)
+cdef Field box_field(const shared_ptr[CField]& field)
cdef Schema box_schema(const shared_ptr[CSchema]& schema)
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index a6aa9d5..85b1617 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -133,6 +133,20 @@ cdef class Schema:
return self.sp_schema.get().Equals(_other.sp_schema)
+ def field_by_name(self, name):
+ """
+ Access a field by its name rather than the column index.
+
+ Parameters
+ ----------
+ name: str
+
+ Returns
+ -------
+ field: pyarrow.Field
+ """
+ return box_field(self.schema.GetFieldByName(tobytes(name)))
+
@classmethod
def from_fields(cls, fields):
cdef:
@@ -287,6 +301,11 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type):
out.init(type)
return out
+cdef Field box_field(const shared_ptr[CField]& field):
+ cdef Field out = Field()
+ out.init(field)
+ return out
+
cdef Schema box_schema(const shared_ptr[CSchema]& type):
cdef Schema out = Schema()
out.init_schema(type)
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index dce125a..b720a47 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -30,7 +30,7 @@ import pyarrow.config
from pyarrow.array cimport Array, box_arrow_array
from pyarrow.error import ArrowException
from pyarrow.error cimport check_status
-from pyarrow.schema cimport box_data_type, box_schema
+from pyarrow.schema cimport box_data_type, box_schema, Field
from pyarrow.compat import frombytes, tobytes
@@ -277,16 +277,20 @@ cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
-cdef _dataframe_to_arrays(df, name, timestamps_to_ms):
+cdef _dataframe_to_arrays(df, name, timestamps_to_ms, Schema schema):
from pyarrow.array import from_pandas_series
cdef:
list names = []
list arrays = []
+ Field field = None
for name in df.columns:
col = df[name]
- arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms)
+ if schema is not None:
+ field = schema.field_by_name(name)
+ arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms,
+ field=field)
names.append(name)
arrays.append(arr)
@@ -424,19 +428,22 @@ cdef class RecordBatch:
return pd.DataFrame(dict(zip(names, data)), columns=names)
@classmethod
- def from_pandas(cls, df):
+ def from_pandas(cls, df, schema=None):
"""
Convert pandas.DataFrame to an Arrow RecordBatch
Parameters
----------
df: pandas.DataFrame
+ schema: pyarrow.Schema (optional)
+ The expected schema of the RecordBatch. This can be used to
+ indicate the type of columns if we cannot infer it automatically.
Returns
-------
pyarrow.table.RecordBatch
"""
- names, arrays = _dataframe_to_arrays(df, None, False)
+ names, arrays = _dataframe_to_arrays(df, None, False, schema)
return cls.from_arrays(names, arrays)
@staticmethod
@@ -552,7 +559,7 @@ cdef class Table:
return result
@classmethod
- def from_pandas(cls, df, name=None, timestamps_to_ms=False):
+ def from_pandas(cls, df, name=None, timestamps_to_ms=False, schema=None):
"""
Convert pandas.DataFrame to an Arrow Table
@@ -567,6 +574,10 @@ cdef class Table:
compability with other functionality like Parquet I/O which
only supports milliseconds.
+ schema: pyarrow.Schema (optional)
+ The expected schema of the Arrow Table. This can be used to
+ indicate the type of columns if we cannot infer it automatically.
+
Returns
-------
pyarrow.table.Table
@@ -584,7 +595,8 @@ cdef class Table:
<pyarrow.table.Table object at 0x7f05d1fb1b40>
"""
names, arrays = _dataframe_to_arrays(df, name=name,
- timestamps_to_ms=timestamps_to_ms)
+ timestamps_to_ms=timestamps_to_ms,
+ schema=schema)
return cls.from_arrays(names, arrays, name=name)
@staticmethod
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 261eaa8..3928a1f 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -16,6 +16,8 @@
# specific language governing permissions and limitations
# under the License.
+from collections import OrderedDict
+
import datetime
import unittest
@@ -60,8 +62,8 @@ class TestPandasConversion(unittest.TestCase):
pass
def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
- timestamps_to_ms=False, expected_schema=None):
- table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms)
+ timestamps_to_ms=False, expected_schema=None, schema=None):
+ table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms, schema=schema)
result = table.to_pandas(nthreads=nthreads)
if expected_schema:
assert table.schema.equals(expected_schema)
@@ -284,6 +286,53 @@ class TestPandasConversion(unittest.TestCase):
expected['date'] = pd.to_datetime(df['date'])
tm.assert_frame_equal(result, expected)
+ def test_column_of_lists(self):
+ dtypes = [('i1', A.int8()), ('i2', A.int16()),
+ ('i4', A.int32()), ('i8', A.int64()),
+ ('u1', A.uint8()), ('u2', A.uint16()),
+ ('u4', A.uint32()), ('u8', A.uint64()),
+ ('f4', A.float_()), ('f8', A.double())]
+
+ arrays = OrderedDict()
+ fields = []
+ for dtype, arrow_dtype in dtypes:
+ fields.append(A.field(dtype, A.list_(arrow_dtype)))
+ arrays[dtype] = [
+ np.arange(10, dtype=dtype),
+ np.arange(5, dtype=dtype),
+ None,
+ np.arange(1, dtype=dtype)
+ ]
+
+ fields.append(A.field('str', A.list_(A.string())))
+ arrays['str'] = [
+ np.array([u"1", u"ä"], dtype="object"),
+ None,
+ np.array([u"1"], dtype="object"),
+ np.array([u"1", u"2", u"3"], dtype="object")
+ ]
+
+ fields.append(A.field('datetime64', A.list_(A.timestamp('ns'))))
+ arrays['datetime64'] = [
+ np.array(['2007-07-13T01:23:34.123456789',
+ None,
+ '2010-08-13T05:46:57.437699912'],
+ dtype='datetime64[ns]'),
+ None,
+ None,
+ np.array(['2007-07-13T02',
+ None,
+ '2010-08-13T05:46:57.437699912'],
+ dtype='datetime64[ns]'),
+ ]
+
+ df = pd.DataFrame(arrays)
+ schema = A.Schema.from_fields(fields)
+ self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
+ table = A.Table.from_pandas(df, schema=schema)
+ assert table.schema.equals(schema)
+ df_new = table.to_pandas(nthreads=1)
+
def test_threaded_conversion(self):
df = _alltypes_example()
self._check_pandas_roundtrip(df, nthreads=2,
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index ad18eca..8c2d350 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -36,6 +36,7 @@
#include "arrow/api.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/macros.h"
@@ -50,6 +51,8 @@ using arrow::ChunkedArray;
using arrow::Column;
using arrow::Field;
using arrow::DataType;
+using arrow::ListType;
+using arrow::ListBuilder;
using arrow::Status;
using arrow::Table;
using arrow::Type;
@@ -66,6 +69,7 @@ template <>
struct npy_traits<NPY_BOOL> {
typedef uint8_t value_type;
using TypeClass = arrow::BooleanType;
+ using BuilderClass = arrow::BooleanBuilder;
static constexpr bool supports_nulls = false;
static inline bool isnull(uint8_t v) { return false; }
@@ -76,6 +80,7 @@ struct npy_traits<NPY_BOOL> {
struct npy_traits<NPY_##TYPE> { \
typedef T value_type; \
using TypeClass = arrow::CapType##Type; \
+ using BuilderClass = arrow::CapType##Builder; \
\
static constexpr bool supports_nulls = false; \
static inline bool isnull(T v) { return false; } \
@@ -94,6 +99,7 @@ template <>
struct npy_traits<NPY_FLOAT32> {
typedef float value_type;
using TypeClass = arrow::FloatType;
+ using BuilderClass = arrow::FloatBuilder;
static constexpr bool supports_nulls = true;
@@ -104,6 +110,7 @@ template <>
struct npy_traits<NPY_FLOAT64> {
typedef double value_type;
using TypeClass = arrow::DoubleType;
+ using BuilderClass = arrow::DoubleBuilder;
static constexpr bool supports_nulls = true;
@@ -114,6 +121,7 @@ template <>
struct npy_traits<NPY_DATETIME> {
typedef int64_t value_type;
using TypeClass = arrow::TimestampType;
+ using BuilderClass = arrow::TimestampBuilder;
static constexpr bool supports_nulls = true;
@@ -132,6 +140,107 @@ struct npy_traits<NPY_OBJECT> {
static constexpr bool supports_nulls = true;
};
+static inline bool PyObject_is_null(const PyObject* obj) {
+ return obj == Py_None || obj == numpy_nan;
+}
+
+static inline bool PyObject_is_string(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+ return PyUnicode_Check(obj) || PyBytes_Check(obj);
+#else
+ return PyString_Check(obj) || PyUnicode_Check(obj);
+#endif
+}
+
+static inline bool PyObject_is_bool(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+ return PyString_Check(obj) || PyBytes_Check(obj);
+#else
+ return PyString_Check(obj) || PyUnicode_Check(obj);
+#endif
+}
+
+template <int TYPE>
+static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
+ typedef npy_traits<TYPE> traits;
+ typedef typename traits::value_type T;
+
+ int64_t null_count = 0;
+ const T* values = reinterpret_cast<const T*>(data);
+
+ // TODO(wesm): striding
+ for (int i = 0; i < length; ++i) {
+ if (traits::isnull(values[i])) {
+ ++null_count;
+ } else {
+ BitUtil::SetBit(bitmap, i);
+ }
+ }
+
+ return null_count;
+}
+
+template <int TYPE>
+static int64_t ValuesToBytemap(const void* data, int64_t length, uint8_t* valid_bytes) {
+ typedef npy_traits<TYPE> traits;
+ typedef typename traits::value_type T;
+
+ int64_t null_count = 0;
+ const T* values = reinterpret_cast<const T*>(data);
+
+ // TODO(wesm): striding
+ for (int i = 0; i < length; ++i) {
+ valid_bytes[i] = not traits::isnull(values[i]);
+ if (traits::isnull(values[i])) null_count++;
+ }
+
+ return null_count;
+}
+
+Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
+ if (PyArray_NDIM(numpy_array) != 1) {
+ return Status::Invalid("only handle 1-dimensional arrays");
+ }
+
+ if (PyArray_DESCR(numpy_array)->type_num != np_type) {
+ return Status::Invalid("can only handle exact conversions");
+ }
+
+ npy_intp* astrides = PyArray_STRIDES(numpy_array);
+ if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) {
+ return Status::Invalid("No support for strided arrays in lists yet");
+ }
+ return Status::OK();
+}
+
+Status AppendObjectStrings(arrow::StringBuilder& string_builder, PyObject** objects,
+ int64_t objects_length, bool* have_bytes) {
+ PyObject* obj;
+
+ for (int64_t i = 0; i < objects_length; ++i) {
+ obj = objects[i];
+ if (PyUnicode_Check(obj)) {
+ obj = PyUnicode_AsUTF8String(obj);
+ if (obj == NULL) {
+ PyErr_Clear();
+ return Status::TypeError("failed converting unicode to UTF8");
+ }
+ const int32_t length = PyBytes_GET_SIZE(obj);
+ Status s = string_builder.Append(PyBytes_AS_STRING(obj), length);
+ Py_DECREF(obj);
+ if (!s.ok()) { return s; }
+ } else if (PyBytes_Check(obj)) {
+ *have_bytes = true;
+ const int32_t length = PyBytes_GET_SIZE(obj);
+ RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
+ } else {
+ string_builder.AppendNull();
+ }
+ }
+
+ return Status::OK();
+}
+
template <int TYPE>
class ArrowSerializer {
public:
@@ -140,6 +249,8 @@ class ArrowSerializer {
length_ = PyArray_SIZE(arr_);
}
+ void IndicateType(const std::shared_ptr<Field> field) { field_indicator_ = field; }
+
Status Convert(std::shared_ptr<Array>* out);
int stride() const { return PyArray_STRIDES(arr_)[0]; }
@@ -198,28 +309,8 @@ class ArrowSerializer {
RETURN_NOT_OK(string_builder.Resize(length_));
Status s;
- PyObject* obj;
bool have_bytes = false;
- for (int64_t i = 0; i < length_; ++i) {
- obj = objects[i];
- if (PyUnicode_Check(obj)) {
- obj = PyUnicode_AsUTF8String(obj);
- if (obj == NULL) {
- PyErr_Clear();
- return Status::TypeError("failed converting unicode to UTF8");
- }
- const int32_t length = PyBytes_GET_SIZE(obj);
- s = string_builder.Append(PyBytes_AS_STRING(obj), length);
- Py_DECREF(obj);
- if (!s.ok()) { return s; }
- } else if (PyBytes_Check(obj)) {
- have_bytes = true;
- const int32_t length = PyBytes_GET_SIZE(obj);
- RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
- } else {
- string_builder.AppendNull();
- }
- }
+ RETURN_NOT_OK(AppendObjectStrings(string_builder, objects, length_, &have_bytes));
RETURN_NOT_OK(string_builder.Finish(out));
if (have_bytes) {
@@ -258,6 +349,36 @@ class ArrowSerializer {
return Status::OK();
}
+ template <int ITEM_TYPE, typename ArrowType>
+ Status ConvertTypedLists(
+ const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out);
+
+#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType) \
+ case Type::TYPE: { \
+ return ConvertTypedLists<NUMPY_TYPE, ::arrow::ArrowType>(field, out); \
+ }
+
+ Status ConvertLists(const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+ switch (field->type->type) {
+ LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
+ LIST_CASE(INT8, NPY_INT8, Int8Type)
+ LIST_CASE(UINT16, NPY_UINT16, UInt16Type)
+ LIST_CASE(INT16, NPY_INT16, Int16Type)
+ LIST_CASE(UINT32, NPY_UINT32, UInt32Type)
+ LIST_CASE(INT32, NPY_INT32, Int32Type)
+ LIST_CASE(UINT64, NPY_UINT64, UInt64Type)
+ LIST_CASE(INT64, NPY_INT64, Int64Type)
+ LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
+ LIST_CASE(FLOAT, NPY_FLOAT, FloatType)
+ LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
+ LIST_CASE(STRING, NPY_OBJECT, StringType)
+ default:
+ return Status::TypeError("Unknown list item type");
+ }
+
+ return Status::TypeError("Unknown list type");
+ }
+
Status MakeDataType(std::shared_ptr<DataType>* out);
arrow::MemoryPool* pool_;
@@ -267,6 +388,7 @@ class ArrowSerializer {
int64_t length_;
+ std::shared_ptr<Field> field_indicator_;
std::shared_ptr<arrow::Buffer> data_;
std::shared_ptr<arrow::ResizableBuffer> null_bitmap_;
uint8_t* null_bitmap_data_;
@@ -288,26 +410,6 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap
}
template <int TYPE>
-static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
- typedef npy_traits<TYPE> traits;
- typedef typename traits::value_type T;
-
- int64_t null_count = 0;
- const T* values = reinterpret_cast<const T*>(data);
-
- // TODO(wesm): striding
- for (int i = 0; i < length; ++i) {
- if (traits::isnull(values[i])) {
- ++null_count;
- } else {
- BitUtil::SetBit(bitmap, i);
- }
- }
-
- return null_count;
-}
-
-template <int TYPE>
inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out) {
out->reset(new typename npy_traits<TYPE>::TypeClass());
return Status::OK();
@@ -361,26 +463,6 @@ inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
return Status::OK();
}
-static inline bool PyObject_is_null(const PyObject* obj) {
- return obj == Py_None || obj == numpy_nan;
-}
-
-static inline bool PyObject_is_string(const PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
- return PyUnicode_Check(obj) || PyBytes_Check(obj);
-#else
- return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
-static inline bool PyObject_is_bool(const PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
- return PyString_Check(obj) || PyBytes_Check(obj);
-#else
- return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
template <>
inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out) {
// Python object arrays are annoying, since we could have one of:
@@ -401,17 +483,34 @@ inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out)
PyDateTime_IMPORT;
}
- for (int64_t i = 0; i < length_; ++i) {
- if (PyObject_is_null(objects[i])) {
- continue;
- } else if (PyObject_is_string(objects[i])) {
- return ConvertObjectStrings(out);
- } else if (PyBool_Check(objects[i])) {
- return ConvertBooleans(out);
- } else if (PyDate_CheckExact(objects[i])) {
- return ConvertDates(out);
- } else {
- return Status::TypeError("unhandled python type");
+ if (field_indicator_) {
+ switch (field_indicator_->type->type) {
+ case Type::STRING:
+ return ConvertObjectStrings(out);
+ case Type::BOOL:
+ return ConvertBooleans(out);
+ case Type::DATE:
+ return ConvertDates(out);
+ case Type::LIST: {
+ auto list_field = static_cast<ListType*>(field_indicator_->type.get());
+ return ConvertLists(list_field->value_field(), out);
+ }
+ default:
+ return Status::TypeError("No known conversion to Arrow type");
+ }
+ } else {
+ for (int64_t i = 0; i < length_; ++i) {
+ if (PyObject_is_null(objects[i])) {
+ continue;
+ } else if (PyObject_is_string(objects[i])) {
+ return ConvertObjectStrings(out);
+ } else if (PyBool_Check(objects[i])) {
+ return ConvertBooleans(out);
+ } else if (PyDate_CheckExact(objects[i])) {
+ return ConvertDates(out);
+ } else {
+ return Status::TypeError("unhandled python type");
+ }
}
}
@@ -449,6 +548,81 @@ inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
return Status::OK();
}
+template <int TYPE>
+template <int ITEM_TYPE, typename ArrowType>
+inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
+ const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+ typedef npy_traits<ITEM_TYPE> traits;
+ typedef typename traits::value_type T;
+ typedef typename traits::BuilderClass BuilderT;
+
+ auto value_builder = std::make_shared<BuilderT>(pool_, field->type);
+ ListBuilder list_builder(pool_, value_builder);
+ PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ for (int64_t i = 0; i < length_; ++i) {
+ if (PyObject_is_null(objects[i])) {
+ RETURN_NOT_OK(list_builder.AppendNull());
+ } else if (PyArray_Check(objects[i])) {
+ auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
+ RETURN_NOT_OK(list_builder.Append(true));
+
+ // TODO(uwe): Support more complex numpy array structures
+ RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
+
+ int32_t size = PyArray_DIM(numpy_array, 0);
+ auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
+ if (traits::supports_nulls) {
+ null_bitmap_->Resize(size, false);
+ // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't
+ // currently support this.
+ // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
+ ValuesToBytemap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
+ RETURN_NOT_OK(value_builder->Append(data, size, null_bitmap_->data()));
+ } else {
+ RETURN_NOT_OK(value_builder->Append(data, size));
+ }
+ } else if (PyList_Check(objects[i])) {
+ return Status::TypeError("Python lists are not yet supported");
+ } else {
+ return Status::TypeError("Unsupported Python type for list items");
+ }
+ }
+ return list_builder.Finish(out);
+}
+
+template <>
+template <>
+inline Status
+ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
+ const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+ // TODO: If there are bytes involved, convert to Binary representation
+ bool have_bytes = false;
+
+ auto value_builder = std::make_shared<arrow::StringBuilder>(pool_, field->type);
+ ListBuilder list_builder(pool_, value_builder);
+ PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ for (int64_t i = 0; i < length_; ++i) {
+ if (PyObject_is_null(objects[i])) {
+ RETURN_NOT_OK(list_builder.AppendNull());
+ } else if (PyArray_Check(objects[i])) {
+ auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
+ RETURN_NOT_OK(list_builder.Append(true));
+
+ // TODO(uwe): Support more complex numpy array structures
+ RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
+
+ int32_t size = PyArray_DIM(numpy_array, 0);
+ auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
+ RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
+ } else if (PyList_Check(objects[i])) {
+ return Status::TypeError("Python lists are not yet supported");
+ } else {
+ return Status::TypeError("Unsupported Python type for list items");
+ }
+ }
+ return list_builder.Finish(out);
+}
+
template <>
inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
return Status::TypeError("NYI");
@@ -460,8 +634,8 @@ inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
RETURN_NOT_OK(converter.Convert(out)); \
} break;
-Status PandasMaskedToArrow(
- arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr<Array>* out) {
+Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
+ const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
PyArrayObject* mask = nullptr;
@@ -484,7 +658,11 @@ Status PandasMaskedToArrow(
TO_ARROW_CASE(FLOAT32);
TO_ARROW_CASE(FLOAT64);
TO_ARROW_CASE(DATETIME);
- TO_ARROW_CASE(OBJECT);
+ case NPY_OBJECT: {
+ ArrowSerializer<NPY_OBJECT> converter(pool, arr, mask);
+ converter.IndicateType(field);
+ RETURN_NOT_OK(converter.Convert(out));
+ } break;
default:
std::stringstream ss;
ss << "unsupported type " << PyArray_DESCR(arr)->type_num << std::endl;
@@ -493,8 +671,9 @@ Status PandasMaskedToArrow(
return Status::OK();
}
-Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<Array>* out) {
- return PandasMaskedToArrow(pool, ao, nullptr, out);
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+ const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
+ return PandasMaskedToArrow(pool, ao, nullptr, field, out);
}
// ----------------------------------------------------------------------
@@ -739,6 +918,56 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values)
return Status::OK();
}
+template <typename ArrowType>
+inline Status ConvertListsLike(
+ const std::shared_ptr<Column>& col, PyObject** out_values) {
+ typedef arrow_traits<ArrowType::type_id> traits;
+ typedef typename ::arrow::TypeTraits<ArrowType>::ArrayType ArrayType;
+
+ const ChunkedArray& data = *col->data().get();
+ auto list_type = std::static_pointer_cast<ListType>(col->type());
+
+ // Get column of underlying value arrays
+ std::vector<std::shared_ptr<Array>> value_arrays;
+ for (int c = 0; c < data.num_chunks(); c++) {
+ auto arr = std::static_pointer_cast<arrow::ListArray>(data.chunk(c));
+ value_arrays.emplace_back(arr->values());
+ }
+ auto flat_column = std::make_shared<Column>(list_type->value_field(), value_arrays);
+ // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
+ // Storing a reference to the whole Array would be too expensive.
+ PyObject* numpy_array;
+ RETURN_NOT_OK(ConvertColumnToPandas(flat_column, nullptr, &numpy_array));
+
+ PyAcquireGIL lock;
+
+ for (int c = 0; c < data.num_chunks(); c++) {
+ auto arr = std::static_pointer_cast<arrow::ListArray>(data.chunk(c));
+
+ const uint8_t* data_ptr;
+ int32_t length;
+ const bool has_nulls = data.null_count() > 0;
+ for (int64_t i = 0; i < arr->length(); ++i) {
+ if (has_nulls && arr->IsNull(i)) {
+ Py_INCREF(Py_None);
+ *out_values = Py_None;
+ } else {
+ PyObject* start = PyLong_FromLong(arr->value_offset(i));
+ PyObject* end = PyLong_FromLong(arr->value_offset(i + 1));
+ PyObject* slice = PySlice_New(start, end, NULL);
+ *out_values = PyObject_GetItem(numpy_array, slice);
+ Py_DECREF(start);
+ Py_DECREF(end);
+ Py_DECREF(slice);
+ }
+ ++out_values;
+ }
+ }
+
+ Py_XDECREF(numpy_array);
+ return Status::OK();
+}
+
template <typename T>
inline void ConvertNumericNullable(const ChunkedArray& data, T na_value, T* out_values) {
for (int c = 0; c < data.num_chunks(); c++) {
@@ -886,8 +1115,11 @@ class ArrowDeserializer {
CONVERT_CASE(STRING);
CONVERT_CASE(DATE);
CONVERT_CASE(TIMESTAMP);
- default:
- return Status::NotImplemented("Arrow type reading not implemented");
+ default: {
+ std::stringstream ss;
+ ss << "Arrow type reading not implemented for " << col_->type()->ToString();
+ return Status::NotImplemented(ss.str());
+ }
}
#undef CONVERT_CASE
@@ -903,7 +1135,7 @@ class ArrowDeserializer {
typedef typename arrow_traits<TYPE>::T T;
int npy_type = arrow_traits<TYPE>::npy_type;
- if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+ if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
}
@@ -933,7 +1165,7 @@ class ArrowDeserializer {
typedef typename arrow_traits<TYPE>::T T;
int npy_type = arrow_traits<TYPE>::npy_type;
- if (data_.num_chunks() == 1 && data_.null_count() == 0) {
+ if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
}
@@ -1028,6 +1260,7 @@ class PandasBlock {
PandasBlock(int64_t num_rows, int num_columns)
: num_rows_(num_rows), num_columns_(num_columns) {}
+ virtual ~PandasBlock() {}
virtual Status Allocate() = 0;
virtual Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
@@ -1080,9 +1313,15 @@ class PandasBlock {
DISALLOW_COPY_AND_ASSIGN(PandasBlock);
};
+#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \
+ case Type::ArrowEnum: \
+ RETURN_NOT_OK((ConvertListsLike<::arrow::ArrowType>(col, out_buffer))); \
+ break;
+
class ObjectBlock : public PandasBlock {
public:
using PandasBlock::PandasBlock;
+ virtual ~ObjectBlock() {}
Status Allocate() override { return AllocateNDArray(NPY_OBJECT); }
@@ -1101,6 +1340,27 @@ class ObjectBlock : public PandasBlock {
RETURN_NOT_OK(ConvertBinaryLike<arrow::BinaryArray>(data, out_buffer));
} else if (type == Type::STRING) {
RETURN_NOT_OK(ConvertBinaryLike<arrow::StringArray>(data, out_buffer));
+ } else if (type == Type::LIST) {
+ auto list_type = std::static_pointer_cast<ListType>(col->type());
+ switch (list_type->value_type()->type) {
+ CONVERTLISTSLIKE_CASE(UInt8Type, UINT8)
+ CONVERTLISTSLIKE_CASE(Int8Type, INT8)
+ CONVERTLISTSLIKE_CASE(UInt16Type, UINT16)
+ CONVERTLISTSLIKE_CASE(Int16Type, INT16)
+ CONVERTLISTSLIKE_CASE(UInt32Type, UINT32)
+ CONVERTLISTSLIKE_CASE(Int32Type, INT32)
+ CONVERTLISTSLIKE_CASE(UInt64Type, UINT64)
+ CONVERTLISTSLIKE_CASE(Int64Type, INT64)
+ CONVERTLISTSLIKE_CASE(TimestampType, TIMESTAMP)
+ CONVERTLISTSLIKE_CASE(FloatType, FLOAT)
+ CONVERTLISTSLIKE_CASE(DoubleType, DOUBLE)
+ CONVERTLISTSLIKE_CASE(StringType, STRING)
+ default: {
+ std::stringstream ss;
+ ss << "Not implemented type for lists: " << list_type->value_type()->ToString();
+ return Status::NotImplemented(ss.str());
+ }
+ }
} else {
std::stringstream ss;
ss << "Unsupported type for object array output: " << col->type()->ToString();
@@ -1396,6 +1656,32 @@ class DataFrameBlockCreator {
case Type::TIMESTAMP:
output_type = PandasBlock::DATETIME;
break;
+ case Type::LIST: {
+ auto list_type = std::static_pointer_cast<ListType>(col->type());
+ switch (list_type->value_type()->type) {
+ case Type::UINT8:
+ case Type::INT8:
+ case Type::UINT16:
+ case Type::INT16:
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::INT64:
+ case Type::UINT64:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ case Type::STRING:
+ case Type::TIMESTAMP:
+ // The above types are all supported.
+ break;
+ default: {
+ std::stringstream ss;
+ ss << "Not implemented type for lists: "
+ << list_type->value_type()->ToString();
+ return Status::NotImplemented(ss.str());
+ }
+ }
+ output_type = PandasBlock::OBJECT;
+ } break;
default:
return Status::NotImplemented(col->type()->ToString());
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/353772f8/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index 60dadd4..664365e 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -31,6 +31,7 @@ namespace arrow {
class Array;
class Column;
+class Field;
class MemoryPool;
class Status;
class Table;
@@ -63,11 +64,11 @@ arrow::Status ConvertTableToPandas(
PYARROW_EXPORT
arrow::Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
- std::shared_ptr<arrow::Array>* out);
+ const std::shared_ptr<arrow::Field>& field, std::shared_ptr<arrow::Array>* out);
PYARROW_EXPORT
-arrow::Status PandasToArrow(
- arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr<arrow::Array>* out);
+arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+ const std::shared_ptr<arrow::Field>& field, std::shared_ptr<arrow::Array>* out);
} // namespace pyarrow