You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/09/12 14:37:42 UTC

[arrow] branch master updated: ARROW-1963: [C++/Python] Create Array from sequence of numpy.datetime64

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8d71906  ARROW-1963: [C++/Python] Create Array from sequence of numpy.datetime64
8d71906 is described below

commit 8d719063d2ba85e12c27ec61fcead1c630ff0b14
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Wed Sep 12 16:37:24 2018 +0200

    ARROW-1963: [C++/Python] Create Array from sequence of numpy.datetime64
    
    - fix NaT sentinel check when converting from sequence
    - always treat NaT as null in ListConverter
    - separate error message for generic datetime units
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #2528 from kszucs/ARROW-1963 and squashes the following commits:
    
    2ee68dbf <Krisztián Szűcs> fix NaT sentinel checking when converting from sequence; always treat NaT as null in ListConverter; separate error message for generic datetime units
---
 cpp/src/arrow/python/numpy_convert.cc   |  2 +
 cpp/src/arrow/python/python_to_arrow.cc | 12 +++++-
 python/pyarrow/array.pxi                |  1 +
 python/pyarrow/tests/test_array.py      | 72 ++++++++++++++++++++++++++++-----
 4 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_convert.cc b/cpp/src/arrow/python/numpy_convert.cc
index 97220a5..d95e337 100644
--- a/cpp/src/arrow/python/numpy_convert.cc
+++ b/cpp/src/arrow/python/numpy_convert.cc
@@ -174,6 +174,8 @@ Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out) {
         case NPY_FR_D:
           *out = date32();
           break;
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic datetime64 time unit");
         default:
           return Status::NotImplemented("Unsupported datetime64 time unit");
       }
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 2a75989..783abd8 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -349,6 +349,8 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert
       }
     } else if (PyArray_CheckAnyScalarExact(obj)) {
       // numpy.datetime64
+      using traits = internal::npy_traits<NPY_DATETIME>;
+
       std::shared_ptr<DataType> type;
       RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type));
       if (type->id() != Type::TIMESTAMP) {
@@ -364,6 +366,10 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert
       }
 
       t = reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
+      if (traits::isnull(t)) {
+        // checks numpy NaT sentinel after conversion
+        return typed_builder_->AppendNull();
+      }
     } else {
       RETURN_NOT_OK(internal::CIntFromPython(obj, &t));
     }
@@ -575,7 +581,11 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
   using T = typename traits::value_type;
   using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
 
-  const bool null_sentinels_possible = (from_pandas_ && traits::supports_nulls);
+  const bool null_sentinels_possible =
+      // Always treat Numpy's NaT as null
+      NUMPY_TYPE == NPY_DATETIME ||
+      // Observing pandas's null sentinels
+      (from_pandas_ && traits::supports_nulls);
 
   auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder());
 
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 362ebc6..bc21e16 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -31,6 +31,7 @@ cdef _sequence_to_array(object sequence, object mask, object size,
     options.from_pandas = from_pandas
 
     cdef shared_ptr[CChunkedArray] out
+
     with nogil:
         check_status(ConvertPySequence(sequence, mask, options, &out))
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 0002dce..89f4d3d 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -861,6 +861,67 @@ def test_array_from_numpy_datetimeD():
     assert result.equals(expected)
 
 
+@pytest.mark.parametrize(('dtype', 'type'), [
+    ('datetime64[s]', pa.timestamp('s')),
+    ('datetime64[ms]', pa.timestamp('ms')),
+    ('datetime64[us]', pa.timestamp('us')),
+    ('datetime64[ns]', pa.timestamp('ns'))
+])
+def test_array_from_numpy_datetime(dtype, type):
+    data = [
+        None,
+        datetime.datetime(2017, 4, 4, 12, 11, 10),
+        datetime.datetime(2018, 1, 1, 0, 2, 0)
+    ]
+
+    # from numpy array
+    arr = pa.array(np.array(data, dtype=dtype))
+    expected = pa.array(data, type=type)
+    assert arr.equals(expected)
+
+    # from list of numpy scalars
+    arr = pa.array(list(np.array(data, dtype=dtype)))
+    assert arr.equals(expected)
+
+
+def test_array_from_different_numpy_datetime_units_raises():
+    data = [
+        None,
+        datetime.datetime(2017, 4, 4, 12, 11, 10),
+        datetime.datetime(2018, 1, 1, 0, 2, 0)
+    ]
+    s = np.array(data, dtype='datetime64[s]')
+    ms = np.array(data, dtype='datetime64[ms]')
+    data = list(s[:2]) + list(ms[2:])
+
+    with pytest.raises(pa.ArrowNotImplementedError):
+        pa.array(data)
+
+
+@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
+def test_array_from_list_of_timestamps(unit):
+    n = np.datetime64('NaT', unit)
+    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
+    y = np.datetime64('2018-11-22 12:24:48.111111111', unit)
+
+    a1 = pa.array([n, x, y])
+    a2 = pa.array([n, x, y], type=pa.timestamp(unit))
+
+    assert a1.type == a2.type
+    assert a1.type.unit == unit
+    assert a1[0] == a2[0]
+
+
+def test_array_from_timestamp_with_generic_unit():
+    n = np.datetime64('NaT')
+    x = np.datetime64('2017-01-01 01:01:01.111111111')
+    y = np.datetime64('2018-11-22 12:24:48.111111111')
+
+    with pytest.raises(pa.ArrowNotImplementedError,
+                       match='Unbound or generic datetime64 time unit'):
+        pa.array([n, x, y])
+
+
 def test_array_from_py_float32():
     data = [[1.2, 3.4], [9.0, 42.0]]
 
@@ -1068,14 +1129,3 @@ def test_nested_dictionary_array():
     dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
     dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr)
     assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']
-
-
-@pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
-def test_timestamp_units_from_list(unit):
-    x = np.datetime64('2017-01-01 01:01:01.111111111', unit)
-    a1 = pa.array([x])
-    a2 = pa.array([x], type=pa.timestamp(unit))
-
-    assert a1.type == a2.type
-    assert a1.type.unit == unit
-    assert a1[0] == a2[0]