You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/01/28 16:31:07 UTC
[arrow] branch master updated: ARROW-1646: [Python] Handle NumPy scalar types
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0543953 ARROW-1646: [Python] Handle NumPy scalar types
0543953 is described below
commit 05439532e70c105f8f282e2963dc31e0340ec503
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Sun Jan 28 17:30:59 2018 +0100
ARROW-1646: [Python] Handle NumPy scalar types
Author: Korn, Uwe <Uw...@blue-yonder.com>
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #1475 from xhochy/ARROW-1646 and squashes the following commits:
7d85879 [Uwe L. Korn] flake8
eb4c08d [Korn, Uwe] ARROW-1646: [Python] pyarrow.array cannot handle NumPy scalar types
---
cpp/src/arrow/python/builtin_convert.cc | 34 ++
cpp/src/arrow/python/numpy_convert.cc | 3 +
cpp/src/arrow/python/numpy_convert.h | 2 +
cpp/src/arrow/python/numpy_interop.h | 1 +
python/pyarrow/tests/test_convert_builtin.py | 805 +++++++++++++++------------
5 files changed, 492 insertions(+), 353 deletions(-)
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 71f2fde..f7a370c 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -32,6 +32,7 @@
#include "arrow/util/logging.h"
#include "arrow/python/helpers.h"
+#include "arrow/python/numpy_convert.h"
#include "arrow/python/util/datetime.h"
namespace arrow {
@@ -93,6 +94,21 @@ class ScalarVisitor {
++binary_count_;
} else if (PyUnicode_Check(obj)) {
++unicode_count_;
+ } else if (PyArray_CheckAnyScalarExact(obj)) {
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
+ if (is_integer(type->id())) {
+ ++int_count_;
+ } else if (is_floating(type->id())) {
+ ++float_count_;
+ } else if (type->id() == Type::TIMESTAMP) {
+ ++timestamp_count_;
+ } else {
+ std::ostringstream ss;
+ ss << "Found a NumPy scalar with Arrow dtype that we cannot handle: ";
+ ss << type->ToString();
+ return Status::Invalid(ss.str());
+ }
} else {
// TODO(wesm): accumulate error information somewhere
static std::string supported_types =
@@ -575,6 +591,24 @@ class TimestampConverter
t = PyDateTime_to_ns(pydatetime);
break;
}
+ } else if (PyArray_CheckAnyScalarExact(item.obj())) {
+ // numpy.datetime64
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(item.obj()), &type));
+ if (type->id() != Type::TIMESTAMP) {
+ std::ostringstream ss;
+ ss << "Expected np.datetime64 but got: ";
+ ss << type->ToString();
+ return Status::Invalid(ss.str());
+ }
+ const TimestampType& ttype = static_cast<const TimestampType&>(*type);
+ if (unit_ != ttype.unit()) {
+ return Status::NotImplemented(
+ "Cannot convert NumPy datetime64 objects with differing unit");
+ }
+
+ PyDatetimeScalarObject* obj = reinterpret_cast<PyDatetimeScalarObject*>(item.obj());
+ t = obj->obval;
} else {
t = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
RETURN_IF_PYERROR();
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 124745e..c2d055f 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -152,7 +152,10 @@ Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out) {
return Status::TypeError("Did not pass numpy.dtype object");
}
PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
+ return NumPyDtypeToArrow(descr, out);
+}
+Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
int type_num = cast_npy_type_compat(descr->type_num);
switch (type_num) {
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index 93c4848..220e38f 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -56,6 +56,8 @@ bool is_contiguous(PyObject* array);
ARROW_EXPORT
Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
+ARROW_EXPORT
+Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
Status GetNumPyType(const DataType& type, int* type_num);
diff --git a/cpp/src/arrow/python/numpy_interop.h b/cpp/src/arrow/python/numpy_interop.h
index b93200c..8c569e2 100644
--- a/cpp/src/arrow/python/numpy_interop.h
+++ b/cpp/src/arrow/python/numpy_interop.h
@@ -40,6 +40,7 @@
#endif
#include <numpy/arrayobject.h>
+#include <numpy/arrayscalars.h>
#include <numpy/ufuncobject.h>
namespace arrow {
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index d7760da..fa603b1 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -23,6 +23,8 @@ import pyarrow as pa
import datetime
import decimal
+import numpy as np
+import six
class StrangeIterable:
@@ -33,356 +35,453 @@ class StrangeIterable:
return self.lst.__iter__()
-class TestConvertIterable(unittest.TestCase):
-
- def test_iterable_types(self):
- arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
- arr2 = pa.array((0, 1, 2, 3))
-
- assert arr1.equals(arr2)
-
- def test_empty_iterable(self):
- arr = pa.array(StrangeIterable([]))
- assert len(arr) == 0
- assert arr.null_count == 0
- assert arr.type == pa.null()
- assert arr.to_pylist() == []
-
-
-class TestLimitedConvertIterator(unittest.TestCase):
- def test_iterator_types(self):
- arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
- arr2 = pa.array((0, 1, 2))
- assert arr1.equals(arr2)
-
- def test_iterator_size_overflow(self):
- arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
- arr2 = pa.array((0, 1))
- assert arr1.equals(arr2)
-
- def test_iterator_size_underflow(self):
- arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
- arr2 = pa.array((0, 1, 2))
- assert arr1.equals(arr2)
-
-
-class TestConvertSequence(unittest.TestCase):
-
- def test_sequence_types(self):
- arr1 = pa.array([1, 2, 3])
- arr2 = pa.array((1, 2, 3))
-
- assert arr1.equals(arr2)
-
- def test_boolean(self):
- expected = [True, None, False, None]
- arr = pa.array(expected)
- assert len(arr) == 4
- assert arr.null_count == 2
- assert arr.type == pa.bool_()
- assert arr.to_pylist() == expected
-
- def test_empty_list(self):
- arr = pa.array([])
- assert len(arr) == 0
- assert arr.null_count == 0
- assert arr.type == pa.null()
- assert arr.to_pylist() == []
-
- def test_all_none(self):
- arr = pa.array([None, None])
- assert len(arr) == 2
- assert arr.null_count == 2
- assert arr.type == pa.null()
- assert arr.to_pylist() == [None, None]
-
- def test_integer(self):
- expected = [1, None, 3, None]
- arr = pa.array(expected)
- assert len(arr) == 4
- assert arr.null_count == 2
- assert arr.type == pa.int64()
- assert arr.to_pylist() == expected
-
- def test_garbage_collection(self):
- import gc
-
- # Force the cyclic garbage collector to run
- gc.collect()
-
- bytes_before = pa.total_allocated_bytes()
- pa.array([1, None, 3, None])
- gc.collect()
- assert pa.total_allocated_bytes() == bytes_before
-
- def test_double(self):
- data = [1.5, 1, None, 2.5, None, None]
- arr = pa.array(data)
- assert len(arr) == 6
- assert arr.null_count == 3
- assert arr.type == pa.float64()
- assert arr.to_pylist() == data
-
- def test_unicode(self):
- data = [u'foo', u'bar', None, u'mañana']
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.string()
- assert arr.to_pylist() == data
-
- def test_bytes(self):
- u1 = b'ma\xc3\xb1ana'
- data = [b'foo',
- u1.decode('utf-8'), # unicode gets encoded,
- None]
- arr = pa.array(data)
- assert len(arr) == 3
- assert arr.null_count == 1
- assert arr.type == pa.binary()
- assert arr.to_pylist() == [b'foo', u1, None]
-
- def test_utf8_to_unicode(self):
- # ARROW-1225
- data = [b'foo', None, b'bar']
- arr = pa.array(data, type=pa.string())
- assert arr[0].as_py() == u'foo'
-
- # test a non-utf8 unicode string
- val = (u'mañana').encode('utf-16-le')
- with pytest.raises(pa.ArrowException):
- pa.array([val], type=pa.string())
-
- def test_fixed_size_bytes(self):
- data = [b'foof', None, b'barb', b'2346']
- arr = pa.array(data, type=pa.binary(4))
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.binary(4)
- assert arr.to_pylist() == data
-
- def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
- data = [b'foo', None, b'barb', b'2346']
- with self.assertRaises(pa.ArrowInvalid):
- pa.array(data, type=pa.binary(4))
-
- def test_date(self):
- data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
- datetime.date(2040, 2, 26)]
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.type == pa.date64()
- assert arr.null_count == 1
- assert arr[0].as_py() == datetime.date(2000, 1, 1)
- assert arr[1].as_py() is None
- assert arr[2].as_py() == datetime.date(1970, 1, 1)
- assert arr[3].as_py() == datetime.date(2040, 2, 26)
-
- def test_date32(self):
- data = [datetime.date(2000, 1, 1), None]
- arr = pa.array(data, type=pa.date32())
-
- data2 = [10957, None]
- arr2 = pa.array(data2, type=pa.date32())
-
- for x in [arr, arr2]:
- assert len(x) == 2
- assert x.type == pa.date32()
- assert x.null_count == 1
- assert x[0].as_py() == datetime.date(2000, 1, 1)
- assert x[1] is pa.NA
-
- # Overflow
- data3 = [2**32, None]
- with pytest.raises(pa.ArrowException):
- pa.array(data3, type=pa.date32())
-
- def test_timestamp(self):
- data = [
- datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
- None,
- datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
- datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
- ]
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.type == pa.timestamp('us')
- assert arr.null_count == 1
- assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123456)
- assert arr[1].as_py() is None
- assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
- 34, 56, 432539)
- assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
- 46, 57, 437699)
-
- def test_timestamp_with_unit(self):
- data = [
- datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
- ]
-
- s = pa.timestamp('s')
- ms = pa.timestamp('ms')
- us = pa.timestamp('us')
- ns = pa.timestamp('ns')
-
- arr_s = pa.array(data, type=s)
- assert len(arr_s) == 1
- assert arr_s.type == s
- assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 0)
-
- arr_ms = pa.array(data, type=ms)
- assert len(arr_ms) == 1
- assert arr_ms.type == ms
- assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123000)
-
- arr_us = pa.array(data, type=us)
- assert len(arr_us) == 1
- assert arr_us.type == us
- assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123456)
-
- arr_ns = pa.array(data, type=ns)
- assert len(arr_ns) == 1
- assert arr_ns.type == ns
- assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
- 23, 34, 123456)
-
- def test_timestamp_from_int_with_unit(self):
- data = [1]
-
- s = pa.timestamp('s')
- ms = pa.timestamp('ms')
- us = pa.timestamp('us')
- ns = pa.timestamp('ns')
-
- arr_s = pa.array(data, type=s)
- assert len(arr_s) == 1
- assert arr_s.type == s
- assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"
-
- arr_ms = pa.array(data, type=ms)
- assert len(arr_ms) == 1
- assert arr_ms.type == ms
- assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"
-
- arr_us = pa.array(data, type=us)
- assert len(arr_us) == 1
- assert arr_us.type == us
- assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"
-
- arr_ns = pa.array(data, type=ns)
- assert len(arr_ns) == 1
- assert arr_ns.type == ns
- assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"
-
- with pytest.raises(pa.ArrowException):
- class CustomClass():
- pass
- pa.array([1, CustomClass()], type=ns)
- pa.array([1, CustomClass()], type=pa.date32())
- pa.array([1, CustomClass()], type=pa.date64())
-
- def test_mixed_nesting_levels(self):
- pa.array([1, 2, None])
- pa.array([[1], [2], None])
- pa.array([[1], [2], [None]])
-
- with self.assertRaises(pa.ArrowInvalid):
- pa.array([1, 2, [1]])
-
- with self.assertRaises(pa.ArrowInvalid):
- pa.array([1, 2, []])
-
- with self.assertRaises(pa.ArrowInvalid):
- pa.array([[1], [2], [None, [1]]])
-
- def test_list_of_int(self):
- data = [[1, 2, 3], [], None, [1, 2]]
- arr = pa.array(data)
- assert len(arr) == 4
- assert arr.null_count == 1
- assert arr.type == pa.list_(pa.int64())
- assert arr.to_pylist() == data
-
- def test_mixed_types_fails(self):
- data = ['a', 1, 2.0]
- with self.assertRaises(pa.ArrowException):
- pa.array(data)
-
- def test_mixed_types_with_specified_type_fails(self):
- data = ['-10', '-5', {'a': 1}, '0', '5', '10']
-
- type = pa.string()
- with self.assertRaises(pa.ArrowInvalid):
- pa.array(data, type=type)
-
- def test_decimal(self):
- data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
- type = pa.decimal128(precision=7, scale=3)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_different_precisions(self):
- data = [
- decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
- ]
- type = pa.decimal128(precision=13, scale=3)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_no_scale(self):
- data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
- type = pa.decimal128(precision=10)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_negative(self):
- data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
- type = pa.decimal128(precision=10, scale=6)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_no_whole_part(self):
- data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
- type = pa.decimal128(precision=7, scale=7)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_decimal_large_integer(self):
- data = [decimal.Decimal('-394029506937548693.42983'),
- decimal.Decimal('32358695912932.01033')]
- type = pa.decimal128(precision=23, scale=5)
- arr = pa.array(data, type=type)
- assert arr.to_pylist() == data
-
- def test_range_types(self):
- arr1 = pa.array(range(3))
- arr2 = pa.array((0, 1, 2))
- assert arr1.equals(arr2)
-
- def test_empty_range(self):
- arr = pa.array(range(0))
- assert len(arr) == 0
- assert arr.null_count == 0
- assert arr.type == pa.null()
- assert arr.to_pylist() == []
-
- def test_structarray(self):
- ints = pa.array([None, 2, 3], type=pa.int64())
- strs = pa.array([u'a', None, u'c'], type=pa.string())
- bools = pa.array([True, False, None], type=pa.bool_())
- arr = pa.StructArray.from_arrays(
- ['ints', 'strs', 'bools'],
- [ints, strs, bools])
-
- expected = [
- {'ints': None, 'strs': u'a', 'bools': True},
- {'ints': 2, 'strs': None, 'bools': False},
- {'ints': 3, 'strs': u'c', 'bools': None},
- ]
-
- pylist = arr.to_pylist()
- assert pylist == expected, (pylist, expected)
+def test_iterable_types():
+ arr1 = pa.array(StrangeIterable([0, 1, 2, 3]))
+ arr2 = pa.array((0, 1, 2, 3))
+
+ assert arr1.equals(arr2)
+
+
+def test_empty_iterable():
+ arr = pa.array(StrangeIterable([]))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+def test_limited_iterator_types():
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=3)
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+
+def test_limited_iterator_size_overflow():
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=2)
+ arr2 = pa.array((0, 1))
+ assert arr1.equals(arr2)
+
+
+def test_limited_iterator_size_underflow():
+ arr1 = pa.array(iter(range(3)), type=pa.int64(), size=10)
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+
+def _as_list(xs):
+ return xs
+
+
+def _as_tuple(xs):
+ return tuple(xs)
+
+
+def _as_dict_values(xs):
+ dct = {k: v for k, v in enumerate(xs)}
+ return six.viewvalues(dct)
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_types(seq):
+ arr1 = pa.array(seq([1, 2, 3]))
+ arr2 = pa.array([1, 2, 3])
+
+ assert arr1.equals(arr2)
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_boolean(seq):
+ expected = [True, None, False, None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.bool_()
+ assert arr.to_pylist() == expected
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_numpy_boolean(seq):
+ expected = [np.bool(True), None, np.bool(False), None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.bool_()
+ assert arr.to_pylist() == expected
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_empty_list(seq):
+ arr = pa.array(seq([]))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+def test_sequence_all_none():
+ arr = pa.array([None, None])
+ assert len(arr) == 2
+ assert arr.null_count == 2
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == [None, None]
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+def test_sequence_integer(seq):
+ expected = [1, None, 3, None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.int64()
+ assert arr.to_pylist() == expected
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+@pytest.mark.parametrize("np_scalar", [np.int16, np.int32, np.int64, np.uint16,
+ np.uint32, np.uint64])
+def test_sequence_numpy_integer(seq, np_scalar):
+ expected = [np_scalar(1), None, np_scalar(3), None]
+ arr = pa.array(seq(expected))
+ assert len(arr) == 4
+ assert arr.null_count == 2
+ assert arr.type == pa.int64()
+ assert arr.to_pylist() == expected
+
+
+def test_garbage_collection():
+ import gc
+
+ # Force the cyclic garbage collector to run
+ gc.collect()
+
+ bytes_before = pa.total_allocated_bytes()
+ pa.array([1, None, 3, None])
+ gc.collect()
+ assert pa.total_allocated_bytes() == bytes_before
+
+
+def test_sequence_double():
+ data = [1.5, 1, None, 2.5, None, None]
+ arr = pa.array(data)
+ assert len(arr) == 6
+ assert arr.null_count == 3
+ assert arr.type == pa.float64()
+ assert arr.to_pylist() == data
+
+
+@pytest.mark.parametrize("seq", [_as_list, _as_tuple, _as_dict_values])
+@pytest.mark.parametrize("np_scalar", [np.float16, np.float32, np.float64])
+def test_sequence_numpy_double(seq, np_scalar):
+ data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, None]
+ arr = pa.array(seq(data))
+ assert len(arr) == 6
+ assert arr.null_count == 3
+ assert arr.type == pa.float64()
+ assert arr.to_pylist() == data
+
+
+def test_sequence_unicode():
+ data = [u'foo', u'bar', None, u'mañana']
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.string()
+ assert arr.to_pylist() == data
+
+
+def test_sequence_bytes():
+ u1 = b'ma\xc3\xb1ana'
+ data = [b'foo',
+ u1.decode('utf-8'), # unicode gets encoded,
+ None]
+ arr = pa.array(data)
+ assert len(arr) == 3
+ assert arr.null_count == 1
+ assert arr.type == pa.binary()
+ assert arr.to_pylist() == [b'foo', u1, None]
+
+
+def test_sequence_utf8_to_unicode():
+ # ARROW-1225
+ data = [b'foo', None, b'bar']
+ arr = pa.array(data, type=pa.string())
+ assert arr[0].as_py() == u'foo'
+
+ # test a non-utf8 unicode string
+ val = (u'mañana').encode('utf-16-le')
+ with pytest.raises(pa.ArrowException):
+ pa.array([val], type=pa.string())
+
+
+def test_sequence_fixed_size_bytes():
+ data = [b'foof', None, b'barb', b'2346']
+ arr = pa.array(data, type=pa.binary(4))
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.binary(4)
+ assert arr.to_pylist() == data
+
+
+def test_fixed_size_bytes_does_not_accept_varying_lengths():
+ data = [b'foo', None, b'barb', b'2346']
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array(data, type=pa.binary(4))
+
+
+def test_sequence_date():
+ data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
+ datetime.date(2040, 2, 26)]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.type == pa.date64()
+ assert arr.null_count == 1
+ assert arr[0].as_py() == datetime.date(2000, 1, 1)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.date(1970, 1, 1)
+ assert arr[3].as_py() == datetime.date(2040, 2, 26)
+
+
+def test_sequence_date32():
+ data = [datetime.date(2000, 1, 1), None]
+ arr = pa.array(data, type=pa.date32())
+
+ data2 = [10957, None]
+ arr2 = pa.array(data2, type=pa.date32())
+
+ for x in [arr, arr2]:
+ assert len(x) == 2
+ assert x.type == pa.date32()
+ assert x.null_count == 1
+ assert x[0].as_py() == datetime.date(2000, 1, 1)
+ assert x[1] is pa.NA
+
+ # Overflow
+ data3 = [2**32, None]
+ with pytest.raises(pa.ArrowException):
+ pa.array(data3, type=pa.date32())
+
+
+def test_sequence_timestamp():
+ data = [
+ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
+ None,
+ datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
+ datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
+ ]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.type == pa.timestamp('us')
+ assert arr.null_count == 1
+ assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
+ 34, 56, 432539)
+ assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
+ 46, 57, 437699)
+
+
+def test_sequence_numpy_timestamp():
+ data = [
+ np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)),
+ None,
+ np.datetime64(datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)),
+ np.datetime64(datetime.datetime(2010, 8, 13, 5, 46, 57, 437699))
+ ]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.type == pa.timestamp('us')
+ assert arr.null_count == 1
+ assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12,
+ 34, 56, 432539)
+ assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5,
+ 46, 57, 437699)
+
+
+def test_sequence_timestamp_with_unit():
+ data = [
+ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
+ ]
+
+ s = pa.timestamp('s')
+ ms = pa.timestamp('ms')
+ us = pa.timestamp('us')
+ ns = pa.timestamp('ns')
+
+ arr_s = pa.array(data, type=s)
+ assert len(arr_s) == 1
+ assert arr_s.type == s
+ assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 0)
+
+ arr_ms = pa.array(data, type=ms)
+ assert len(arr_ms) == 1
+ assert arr_ms.type == ms
+ assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123000)
+
+ arr_us = pa.array(data, type=us)
+ assert len(arr_us) == 1
+ assert arr_us.type == us
+ assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+
+ arr_ns = pa.array(data, type=ns)
+ assert len(arr_ns) == 1
+ assert arr_ns.type == ns
+ assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
+ 23, 34, 123456)
+
+
+def test_sequence_timestamp_from_int_with_unit():
+ data = [1]
+
+ s = pa.timestamp('s')
+ ms = pa.timestamp('ms')
+ us = pa.timestamp('us')
+ ns = pa.timestamp('ns')
+
+ arr_s = pa.array(data, type=s)
+ assert len(arr_s) == 1
+ assert arr_s.type == s
+ assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"
+
+ arr_ms = pa.array(data, type=ms)
+ assert len(arr_ms) == 1
+ assert arr_ms.type == ms
+ assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"
+
+ arr_us = pa.array(data, type=us)
+ assert len(arr_us) == 1
+ assert arr_us.type == us
+ assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"
+
+ arr_ns = pa.array(data, type=ns)
+ assert len(arr_ns) == 1
+ assert arr_ns.type == ns
+ assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"
+
+ with pytest.raises(pa.ArrowException):
+ class CustomClass():
+ pass
+ pa.array([1, CustomClass()], type=ns)
+ pa.array([1, CustomClass()], type=pa.date32())
+ pa.array([1, CustomClass()], type=pa.date64())
+
+
+def test_sequence_mixed_nesting_levels():
+ pa.array([1, 2, None])
+ pa.array([[1], [2], None])
+ pa.array([[1], [2], [None]])
+
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array([1, 2, [1]])
+
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array([1, 2, []])
+
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array([[1], [2], [None, [1]]])
+
+
+def test_sequence_list_of_int():
+ data = [[1, 2, 3], [], None, [1, 2]]
+ arr = pa.array(data)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.list_(pa.int64())
+ assert arr.to_pylist() == data
+
+
+def test_sequence_mixed_types_fails():
+ data = ['a', 1, 2.0]
+ with pytest.raises(pa.ArrowException):
+ pa.array(data)
+
+
+def test_sequence_mixed_types_with_specified_type_fails():
+ data = ['-10', '-5', {'a': 1}, '0', '5', '10']
+
+ type = pa.string()
+ with pytest.raises(pa.ArrowInvalid):
+ pa.array(data, type=type)
+
+
+def test_sequence_decimal():
+ data = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
+ type = pa.decimal128(precision=7, scale=3)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_different_precisions():
+ data = [
+ decimal.Decimal('1234234983.183'), decimal.Decimal('80943244.234')
+ ]
+ type = pa.decimal128(precision=13, scale=3)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_no_scale():
+ data = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
+ type = pa.decimal128(precision=10)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_negative():
+ data = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
+ type = pa.decimal128(precision=10, scale=6)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_no_whole_part():
+ data = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
+ type = pa.decimal128(precision=7, scale=7)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_sequence_decimal_large_integer():
+ data = [decimal.Decimal('-394029506937548693.42983'),
+ decimal.Decimal('32358695912932.01033')]
+ type = pa.decimal128(precision=23, scale=5)
+ arr = pa.array(data, type=type)
+ assert arr.to_pylist() == data
+
+
+def test_range_types():
+ arr1 = pa.array(range(3))
+ arr2 = pa.array((0, 1, 2))
+ assert arr1.equals(arr2)
+
+
+def test_empty_range():
+ arr = pa.array(range(0))
+ assert len(arr) == 0
+ assert arr.null_count == 0
+ assert arr.type == pa.null()
+ assert arr.to_pylist() == []
+
+
+def test_structarray():
+ ints = pa.array([None, 2, 3], type=pa.int64())
+ strs = pa.array([u'a', None, u'c'], type=pa.string())
+ bools = pa.array([True, False, None], type=pa.bool_())
+ arr = pa.StructArray.from_arrays(
+ ['ints', 'strs', 'bools'],
+ [ints, strs, bools])
+
+ expected = [
+ {'ints': None, 'strs': u'a', 'bools': True},
+ {'ints': 2, 'strs': None, 'bools': False},
+ {'ints': 3, 'strs': u'c', 'bools': None},
+ ]
+
+ pylist = arr.to_pylist()
+ assert pylist == expected, (pylist, expected)
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.