You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/03/08 07:39:14 UTC
arrow git commit: ARROW-44: Python: prototype object model for array
slot values ("scalars")
Repository: arrow
Updated Branches:
refs/heads/master 9afb66778 -> ae95dbd18
ARROW-44: Python: prototype object model for array slot values ("scalars")
The set of scalar types is not exhaustive, but it will facilitate inspecting Arrow data while the library is in development.
```python
In [2]: arr = arrow.from_pylist([['foo', None], None, [], ['qux']])
In [3]: arr
Out[3]: <arrow.array.ListArray at 0x7f1970030f98>
In [4]: arr[0]
Out[4]: ['foo', None]
In [5]: type(arr[0])
Out[5]: arrow.scalar.ListValue
In [6]: arr[0][0]
Out[6]: 'foo'
In [7]: arr[0][1]
Out[7]: NA
In [8]: arr[1]
Out[8]: NA
In [9]: arr[2]
Out[9]: []
In [10]: len(arr[2])
Out[10]: 0
In [11]: arr.type
Out[11]: DataType(list<string>)
```
Author: Wes McKinney <we...@apache.org>
Closes #20 from wesm/ARROW-44 and squashes the following commits:
df06ba1 [Wes McKinney] Add tests for scalars proxying implemented Python list type conversions, fix associated bugs
20fbdc1 [Wes McKinney] Draft scalar box types, no tests yet
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/ae95dbd1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/ae95dbd1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/ae95dbd1
Branch: refs/heads/master
Commit: ae95dbd189477442d39e55fb0a1aede206906cd9
Parents: 9afb667
Author: Wes McKinney <we...@apache.org>
Authored: Mon Mar 7 22:39:07 2016 -0800
Committer: Wes McKinney <we...@apache.org>
Committed: Mon Mar 7 22:39:07 2016 -0800
----------------------------------------------------------------------
cpp/src/arrow/types/list.h | 6 +-
python/arrow/__init__.py | 6 +-
python/arrow/array.pxd | 1 -
python/arrow/array.pyx | 17 ++-
python/arrow/compat.py | 6 +
python/arrow/includes/arrow.pxd | 36 +++++-
python/arrow/scalar.pxd | 25 ++++-
python/arrow/scalar.pyx | 165 ++++++++++++++++++++++++++++
python/arrow/schema.pxd | 2 +
python/arrow/schema.pyx | 14 +++
python/arrow/tests/test_scalars.py | 82 ++++++++++++++
python/src/pyarrow/adapters/builtin.cc | 2 +-
12 files changed, 342 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index f40a824..210c76a 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -63,7 +63,11 @@ class ListArray : public Array {
// Return a shared pointer in case the requestor desires to share ownership
// with this array.
- const ArrayPtr& values() const {return values_;}
+ const std::shared_ptr<Array>& values() const {return values_;}
+
+ const std::shared_ptr<DataType>& value_type() const {
+ return values_->type();
+ }
const int32_t* offsets() const { return offsets_;}
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py
index 3c049b8..3507ea0 100644
--- a/python/arrow/__init__.py
+++ b/python/arrow/__init__.py
@@ -24,7 +24,11 @@ from arrow.array import (Array, from_pylist, total_allocated_bytes,
from arrow.error import ArrowException
-from arrow.scalar import ArrayValue, NA, Scalar
+from arrow.scalar import (ArrayValue, Scalar, NA, NAType,
+ BooleanValue,
+ Int8Value, Int16Value, Int32Value, Int64Value,
+ UInt8Value, UInt16Value, UInt32Value, UInt64Value,
+ FloatValue, DoubleValue, ListValue, StringValue)
from arrow.schema import (null, bool_,
int8, int16, int32, int64,
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd
index e32d277..04dd8d1 100644
--- a/python/arrow/array.pxd
+++ b/python/arrow/array.pxd
@@ -34,7 +34,6 @@ cdef class Array:
DataType type
cdef init(self, const shared_ptr[CArray]& sp_array)
- cdef _getitem(self, int i)
cdef class BooleanArray(Array):
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx
index 3a3210d..8ebd01d 100644
--- a/python/arrow/array.pyx
+++ b/python/arrow/array.pyx
@@ -25,6 +25,7 @@ cimport arrow.includes.pyarrow as pyarrow
from arrow.compat import frombytes, tobytes
from arrow.error cimport check_status
+cimport arrow.scalar as scalar
from arrow.scalar import NA
def total_allocated_bytes():
@@ -73,13 +74,7 @@ cdef class Array:
while key < 0:
key += len(self)
- if self.ap.IsNull(key):
- return NA
- else:
- return self._getitem(key)
-
- cdef _getitem(self, int i):
- raise NotImplementedError
+ return scalar.box_arrow_scalar(self.type, self.sp_array, key)
def slice(self, start, end):
pass
@@ -168,12 +163,16 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
return arr
-def from_pylist(object list_obj, type=None):
+def from_pylist(object list_obj, DataType type=None):
"""
Convert Python list to Arrow array
"""
cdef:
shared_ptr[CArray] sp_array
- check_status(pyarrow.ConvertPySequence(list_obj, &sp_array))
+ if type is None:
+ check_status(pyarrow.ConvertPySequence(list_obj, &sp_array))
+ else:
+ raise NotImplementedError
+
return box_arrow_array(sp_array)
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/compat.py
----------------------------------------------------------------------
diff --git a/python/arrow/compat.py b/python/arrow/compat.py
index 2ac41ac..08f0f23 100644
--- a/python/arrow/compat.py
+++ b/python/arrow/compat.py
@@ -54,6 +54,9 @@ if PY2:
range = xrange
long = long
+ def u(s):
+ return unicode(s, "unicode_escape")
+
def tobytes(o):
if isinstance(o, unicode):
return o.encode('utf8')
@@ -73,6 +76,9 @@ else:
from decimal import Decimal
range = range
+ def u(s):
+ return s
+
def tobytes(o):
if isinstance(o, str):
return o.encode('utf8')
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/includes/arrow.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd
index fde5de9..0cc44c0 100644
--- a/python/arrow/includes/arrow.pxd
+++ b/python/arrow/includes/arrow.pxd
@@ -84,13 +84,41 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool IsNull(int i)
cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray):
- pass
+ uint8_t Value(int i)
cdef cppclass CInt8Array" arrow::Int8Array"(CArray):
- pass
+ int8_t Value(int i)
+
+ cdef cppclass CUInt16Array" arrow::UInt16Array"(CArray):
+ uint16_t Value(int i)
+
+ cdef cppclass CInt16Array" arrow::Int16Array"(CArray):
+ int16_t Value(int i)
+
+ cdef cppclass CUInt32Array" arrow::UInt32Array"(CArray):
+ uint32_t Value(int i)
+
+ cdef cppclass CInt32Array" arrow::Int32Array"(CArray):
+ int32_t Value(int i)
+
+ cdef cppclass CUInt64Array" arrow::UInt64Array"(CArray):
+ uint64_t Value(int i)
+
+ cdef cppclass CInt64Array" arrow::Int64Array"(CArray):
+ int64_t Value(int i)
+
+ cdef cppclass CFloatArray" arrow::FloatArray"(CArray):
+ float Value(int i)
+
+ cdef cppclass CDoubleArray" arrow::DoubleArray"(CArray):
+ double Value(int i)
cdef cppclass CListArray" arrow::ListArray"(CArray):
- pass
+ const int32_t* offsets()
+ int32_t offset(int i)
+ int32_t value_length(int i)
+ const shared_ptr[CArray]& values()
+ const shared_ptr[CDataType]& value_type()
cdef cppclass CStringArray" arrow::StringArray"(CListArray):
- pass
+ c_string GetString(int i)
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/scalar.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd
index e193c09..15cdc95 100644
--- a/python/arrow/scalar.pxd
+++ b/python/arrow/scalar.pxd
@@ -16,7 +16,7 @@
# under the License.
from arrow.includes.common cimport *
-from arrow.includes.arrow cimport CArray, CListArray
+from arrow.includes.arrow cimport *
from arrow.schema cimport DataType
@@ -31,17 +31,36 @@ cdef class NAType(Scalar):
cdef class ArrayValue(Scalar):
cdef:
- shared_ptr[CArray] array
+ shared_ptr[CArray] sp_array
int index
+ cdef void init(self, DataType type,
+ const shared_ptr[CArray]& sp_array, int index)
+
+ cdef void _set_array(self, const shared_ptr[CArray]& sp_array)
+
cdef class Int8Value(ArrayValue):
pass
-cdef class ListValue(ArrayValue):
+cdef class Int64Value(ArrayValue):
pass
+cdef class ListValue(ArrayValue):
+ cdef readonly:
+ DataType value_type
+
+ cdef:
+ CListArray* ap
+
+ cdef _getitem(self, int i)
+
+
cdef class StringValue(ArrayValue):
pass
+
+cdef object box_arrow_scalar(DataType type,
+ const shared_ptr[CArray]& sp_array,
+ int index)
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx
index 78dadec..951ede2 100644
--- a/python/arrow/scalar.pyx
+++ b/python/arrow/scalar.pyx
@@ -15,14 +15,179 @@
# specific language governing permissions and limitations
# under the License.
+from arrow.schema cimport DataType, box_data_type
+
+from arrow.compat import frombytes
import arrow.schema as schema
+NA = None
+
cdef class NAType(Scalar):
def __cinit__(self):
+ global NA
+ if NA is not None:
+ raise Exception('Cannot create multiple NAType instances')
+
self.type = schema.null()
def __repr__(self):
return 'NA'
+ def as_py(self):
+ return None
+
NA = NAType()
+
+cdef class ArrayValue(Scalar):
+
+ cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
+ int index):
+ self.type = type
+ self.index = index
+ self._set_array(sp_array)
+
+ cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+ self.sp_array = sp_array
+
+ def __repr__(self):
+ if hasattr(self, 'as_py'):
+ return repr(self.as_py())
+ else:
+ return Scalar.__repr__(self)
+
+
+cdef class BooleanValue(ArrayValue):
+ pass
+
+
+cdef class Int8Value(ArrayValue):
+
+ def as_py(self):
+ cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class UInt8Value(ArrayValue):
+
+ def as_py(self):
+ cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class Int16Value(ArrayValue):
+
+ def as_py(self):
+ cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class UInt16Value(ArrayValue):
+
+ def as_py(self):
+ cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class Int32Value(ArrayValue):
+
+ def as_py(self):
+ cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class UInt32Value(ArrayValue):
+
+ def as_py(self):
+ cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class Int64Value(ArrayValue):
+
+ def as_py(self):
+ cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class UInt64Value(ArrayValue):
+
+ def as_py(self):
+ cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class FloatValue(ArrayValue):
+
+ def as_py(self):
+ cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class DoubleValue(ArrayValue):
+
+ def as_py(self):
+ cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
+ return ap.Value(self.index)
+
+
+cdef class StringValue(ArrayValue):
+
+ def as_py(self):
+ cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
+ return frombytes(ap.GetString(self.index))
+
+
+cdef class ListValue(ArrayValue):
+
+ def __len__(self):
+ return self.ap.value_length(self.index)
+
+ def __getitem__(self, i):
+ return self._getitem(i)
+
+ cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+ self.sp_array = sp_array
+ self.ap = <CListArray*> sp_array.get()
+ self.value_type = box_data_type(self.ap.value_type())
+
+ cdef _getitem(self, int i):
+ cdef int j = self.ap.offset(self.index) + i
+ return box_arrow_scalar(self.value_type, self.ap.values(), j)
+
+ def as_py(self):
+ cdef:
+ int j
+ list result = []
+
+ for j in range(len(self)):
+ result.append(self._getitem(j).as_py())
+
+ return result
+
+
+cdef dict _scalar_classes = {  # LogicalType id -> ArrayValue subclass instantiated by box_arrow_scalar
+ LogicalType_UINT8: UInt8Value,  # unsigned ids must box to UInt*Value: their as_py casts to the unsigned C array type
+ LogicalType_UINT16: UInt16Value,
+ LogicalType_UINT32: UInt32Value,
+ LogicalType_UINT64: UInt64Value,
+ LogicalType_INT8: Int8Value,
+ LogicalType_INT16: Int16Value,
+ LogicalType_INT32: Int32Value,
+ LogicalType_INT64: Int64Value,
+ LogicalType_FLOAT: FloatValue,
+ LogicalType_DOUBLE: DoubleValue,
+ LogicalType_LIST: ListValue,
+ LogicalType_STRING: StringValue
+}  # NOTE(review): no LogicalType_BOOL entry, so the lookup in box_arrow_scalar would raise KeyError for a boolean slot -- confirm intended
+
+cdef object box_arrow_scalar(DataType type,
+ const shared_ptr[CArray]& sp_array,
+ int index):
+ cdef ArrayValue val
+ if sp_array.get().IsNull(index):
+ return NA
+ else:
+ val = _scalar_classes[type.type.type]()
+ val.init(type, sp_array, index)
+ return val
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd
index 487c246..8cc244a 100644
--- a/python/arrow/schema.pxd
+++ b/python/arrow/schema.pxd
@@ -37,3 +37,5 @@ cdef class Schema:
cdef:
shared_ptr[CSchema] sp_schema
CSchema* schema
+
+cdef DataType box_data_type(const shared_ptr[CDataType]& type)
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx
index 63cd6e8..3001531 100644
--- a/python/arrow/schema.pyx
+++ b/python/arrow/schema.pyx
@@ -85,6 +85,14 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True):
def field(name, type):
return Field(name, type)
+cdef set PRIMITIVE_TYPES = set([
+ LogicalType_NA, LogicalType_BOOL,
+ LogicalType_UINT8, LogicalType_INT8,
+ LogicalType_UINT16, LogicalType_INT16,
+ LogicalType_UINT32, LogicalType_INT32,
+ LogicalType_UINT64, LogicalType_INT64,
+ LogicalType_FLOAT, LogicalType_DOUBLE])
+
def null():
return primitive_type(LogicalType_NA)
@@ -148,3 +156,9 @@ def struct(fields, c_bool nullable=True):
out.init(shared_ptr[CDataType](
new CStructType(c_fields, nullable)))
return out
+
+
+cdef DataType box_data_type(const shared_ptr[CDataType]& type):
+ cdef DataType out = DataType()
+ out.init(type)
+ return out
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/arrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/arrow/tests/test_scalars.py b/python/arrow/tests/test_scalars.py
new file mode 100644
index 0000000..951380b
--- /dev/null
+++ b/python/arrow/tests/test_scalars.py
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from arrow.compat import unittest, u
+import arrow
+
+
+class TestScalars(unittest.TestCase):
+
+ def test_null_singleton(self):
+ with self.assertRaises(Exception):
+ arrow.NAType()
+
+ def test_bool(self):
+ pass
+
+ def test_int64(self):
+ arr = arrow.from_pylist([1, 2, None])
+
+ v = arr[0]
+ assert isinstance(v, arrow.Int64Value)
+ assert repr(v) == "1"
+ assert v.as_py() == 1
+
+ assert arr[2] is arrow.NA
+
+ def test_double(self):
+ arr = arrow.from_pylist([1.5, None, 3])
+
+ v = arr[0]
+ assert isinstance(v, arrow.DoubleValue)
+ assert repr(v) == "1.5"
+ assert v.as_py() == 1.5
+
+ assert arr[1] is arrow.NA
+
+ v = arr[2]
+ assert v.as_py() == 3.0
+
+ def test_string(self):
+ arr = arrow.from_pylist(['foo', None, u('bar')])
+
+ v = arr[0]
+ assert isinstance(v, arrow.StringValue)
+ assert repr(v) == "'foo'"
+ assert v.as_py() == 'foo'
+
+ assert arr[1] is arrow.NA
+
+ v = arr[2].as_py()
+ assert v == 'bar'
+ assert isinstance(v, str)
+
+ def test_list(self):
+ arr = arrow.from_pylist([['foo', None], None, ['bar'], []])
+
+ v = arr[0]
+ assert len(v) == 2
+ assert isinstance(v, arrow.ListValue)
+ assert repr(v) == "['foo', None]"
+ assert v.as_py() == ['foo', None]
+ assert v[0].as_py() == 'foo'
+ assert v[1] is arrow.NA
+
+ assert arr[1] is arrow.NA
+
+ v = arr[3]
+ assert len(v) == 0
http://git-wip-us.apache.org/repos/asf/arrow/blob/ae95dbd1/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index ae84fa1..60d6248 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -276,7 +276,7 @@ class Int64Converter : public TypedConverter<arrow::Int64Builder> {
class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
public:
Status AppendData(PyObject* seq) override {
- int64_t val;
+ double val;
Py_ssize_t size = PySequence_Size(seq);
for (int64_t i = 0; i < size; ++i) {
OwnedRef item(PySequence_GetItem(seq, i));