You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/09/12 14:37:42 UTC
[arrow] branch master updated: ARROW-1963: [C++/Python] Create Array from sequence of numpy.datetime64
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8d71906 ARROW-1963: [C++/Python] Create Array from sequence of numpy.datetime64
8d71906 is described below
commit 8d719063d2ba85e12c27ec61fcead1c630ff0b14
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Wed Sep 12 16:37:24 2018 +0200
ARROW-1963: [C++/Python] Create Array from sequence of numpy.datetime64
- fix NaT sentinel check when converting from sequence
- always treat NaT as null in ListConverter
- separate error message for generic datetime units
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #2528 from kszucs/ARROW-1963 and squashes the following commits:
2ee68dbf <Krisztián Szűcs> fix NaT sentinel checking when converting from sequence; always treat NaT as null in ListConverter; separate error message for generic datetime units
---
cpp/src/arrow/python/numpy_convert.cc | 2 +
cpp/src/arrow/python/python_to_arrow.cc | 12 +++++-
python/pyarrow/array.pxi | 1 +
python/pyarrow/tests/test_array.py | 72 ++++++++++++++++++++++++++++-----
4 files changed, 75 insertions(+), 12 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 97220a5..d95e337 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -174,6 +174,8 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
case NPY_FR_D:
*out = date32();
break;
+ case NPY_FR_GENERIC:
+ return Status::NotImplemented("Unbound or generic datetime64 time unit");
default:
return Status::NotImplemented("Unsupported datetime64 time unit");
}
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 2a75989..783abd8 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -349,6 +349,8 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert
}
} else if (PyArray_CheckAnyScalarExact(obj)) {
// numpy.datetime64
+ using traits = internal::npy_traits<NPY_DATETIME>;
+
std::shared_ptr<DataType> type;
RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
if (type->id() != Type::TIMESTAMP) {
@@ -364,6 +366,10 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert
}
t = reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
+ if (traits::isnull(t)) {
+ // checks numpy NaT sentinel after conversion
+ return typed_builder_->AppendNull();
+ }
} else {
RETURN_NOT_OK(internal::CIntFromPython(obj, &t));
}
@@ -575,7 +581,11 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
using T = typename traits::value_type;
using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
- const bool null_sentinels_possible = (from_pandas_ && traits::supports_nulls);
+ const bool null_sentinels_possible =
+ // Always treat Numpy's NaT as null
+ NUMPY_TYPE == NPY_DATETIME ||
+ // Observing pandas's null sentinels
+ (from_pandas_ && traits::supports_nulls);
auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder());
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 362ebc6..bc21e16 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -31,6 +31,7 @@ cdef _sequence_to_array(object sequence, object mask, object size,
options.from_pandas = from_pandas
cdef shared_ptr[CChunkedArray] out
+
with nogil:
check_status(ConvertPySequence(sequence, mask, options, &out))
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 0002dce..89f4d3d 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -861,6 +861,67 @@ def test_array_from_numpy_datetimeD():
assert result.equals(expected)
+@pytest.mark.parametrize(('dtype', 'type'), [
+ ('datetime64[s]', pa.timestamp('s')),
+ ('datetime64[ms]', pa.timestamp('ms')),
+ ('datetime64[us]', pa.timestamp('us')),
+ ('datetime64[ns]', pa.timestamp('ns'))
+])
+def test_array_from_numpy_datetime(dtype, type):
+ data = [
+ None,
+ datetime.datetime(2017, 4, 4, 12, 11, 10),
+ datetime.datetime(2018, 1, 1, 0, 2, 0)
+ ]
+
+ # from numpy array
+ arr = pa.array(np.array(data, dtype=dtype))
+ expected = pa.array(data, type=type)
+ assert arr.equals(expected)
+
+ # from list of numpy scalars
+ arr = pa.array(list(np.array(data, dtype=dtype)))
+ assert arr.equals(expected)
+
+
+def test_array_from_different_numpy_datetime_units_raises():
+ data = [
+ None,
+ datetime.datetime(2017, 4, 4, 12, 11, 10),
+ datetime.datetime(2018, 1, 1, 0, 2, 0)
+ ]
+ s = np.array(data, dtype='datetime64[s]')
+ ms = np.array(data, dtype='datetime64[ms]')
+ data = list(s[:2]) + list(ms[2:])
+
+ with pytest.raises(pa.ArrowNotImplementedError):
+ pa.array(data)
+
+
+@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
+def test_array_from_list_of_timestamps(unit):
+ n = np.datetime64('NaT', unit)
+ x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
+ y = np.datetime64('2018-11-22 12:24:48.111111111', unit)
+
+ a1 = pa.array([n, x, y])
+ a2 = pa.array([n, x, y], type=pa.timestamp(unit))
+
+ assert a1.type == a2.type
+ assert a1.type.unit == unit
+ assert a1[0] == a2[0]
+
+
+def test_array_from_timestamp_with_generic_unit():
+ n = np.datetime64('NaT')
+ x = np.datetime64('2017-01-01 01:01:01.111111111')
+ y = np.datetime64('2018-11-22 12:24:48.111111111')
+
+ with pytest.raises(pa.ArrowNotImplementedError,
+ match='Unbound or generic datetime64 time unit'):
+ pa.array([n, x, y])
+
+
def test_array_from_py_float32():
data = [[1.2, 3.4], [9.0, 42.0]]
@@ -1068,14 +1129,3 @@ def test_nested_dictionary_array():
dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr)
assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']
-
-
-@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
-def test_timestamp_units_from_list(unit):
- x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
- a1 = pa.array([x])
- a2 = pa.array([x], type=pa.timestamp(unit))
-
- assert a1.type == a2.type
- assert a1.type.unit == unit
- assert a1[0] == a2[0]