You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/08 14:31:32 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #8349: ARROW-3080: [Python] Unify Arrow to Python object conversion paths

pitrou commented on a change in pull request #8349:
URL: https://github.com/apache/arrow/pull/8349#discussion_r501762533



##########
File path: cpp/src/arrow/python/helpers.cc
##########
@@ -258,30 +266,46 @@ bool PyFloat_IsNaN(PyObject* obj) {
 namespace {
 
 static std::once_flag pandas_static_initialized;
-static PyTypeObject* pandas_NaTType = nullptr;
+
 static PyObject* pandas_NA = nullptr;
+static PyObject* pandas_NaT = nullptr;
+static PyObject* pandas_Timedelta = nullptr;
+static PyObject* pandas_Timestamp = nullptr;
+static PyTypeObject* pandas_NaTType = nullptr;
 
 void GetPandasStaticSymbols() {
   OwnedRef pandas;
+
+  // import pandas
   Status s = ImportModule("pandas", &pandas);
   if (!s.ok()) {
     return;
   }
 
   OwnedRef ref;
-  s = ImportFromModule(pandas.obj(), "NaT", &ref);
-  if (!s.ok()) {
-    return;
+
+  // set NaT sentinel and its type
+  if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
+    pandas_NaT = ref.obj();
+    // PyObject_Type returns a new reference but we trust that pandas.NaT will

Review comment:
       You can just use the `Py_TYPE` macro then: https://docs.python.org/3/c-api/structures.html#c.Py_TYPE

##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -1903,18 +1946,40 @@ def test_dictionary_from_strings():
     assert a.dictionary.equals(expected_dictionary)
 
 
-def _has_unique_field_names(ty):
-    if isinstance(ty, pa.StructType):
-        field_names = [field.name for field in ty]
-        return len(set(field_names)) == len(field_names)
-    else:
-        return True
+@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
+def test_duration_array_roundtrip_corner_cases(unit):
+    # Corner case discovered by hypothesis: there were implicit conversions to
+    # unsigned values resulting wrong values with wrong signs.
+    ty = pa.duration(unit)
+    arr = pa.array([-2147483000], type=ty)
+    restored = pa.array(arr.to_pylist(), type=ty)

Review comment:
       Shouldn't you also test the actual value of `restored`?

##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -1903,18 +1946,40 @@ def test_dictionary_from_strings():
     assert a.dictionary.equals(expected_dictionary)
 
 
-def _has_unique_field_names(ty):
-    if isinstance(ty, pa.StructType):
-        field_names = [field.name for field in ty]
-        return len(set(field_names)) == len(field_names)
-    else:
-        return True
+@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
+def test_duration_array_roundtrip_corner_cases(unit):
+    # Corner case discovered by hypothesis: there were implicit conversions to
+    # unsigned values resulting wrong values with wrong signs.
+    ty = pa.duration(unit)
+    arr = pa.array([-2147483000], type=ty)
+    restored = pa.array(arr.to_pylist(), type=ty)
+    assert arr.equals(restored)
+
+
+@pytest.mark.pandas
+def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
+    # corner case discovered by hypothesis: preserving the nanoseconds on
+    # conversion from a list of Timedelta and Timestamp objects
+    import pandas as pd
+
+    ty = pa.duration('ns')
+    arr = pa.array([9223371273709551616], type=ty)
+    data = arr.to_pylist()
+    assert isinstance(data[0], pd.Timedelta)
+    restored = pa.array(data, type=ty)

Review comment:
       Same here and below: Shouldn't you also test the actual value of `restored`?

##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -1903,18 +1946,40 @@ def test_dictionary_from_strings():
     assert a.dictionary.equals(expected_dictionary)
 
 
-def _has_unique_field_names(ty):
-    if isinstance(ty, pa.StructType):
-        field_names = [field.name for field in ty]
-        return len(set(field_names)) == len(field_names)
-    else:
-        return True
+@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
+def test_duration_array_roundtrip_corner_cases(unit):
+    # Corner case discovered by hypothesis: there were implicit conversions to
+    # unsigned values resulting wrong values with wrong signs.
+    ty = pa.duration(unit)
+    arr = pa.array([-2147483000], type=ty)
+    restored = pa.array(arr.to_pylist(), type=ty)
+    assert arr.equals(restored)
+
+
+@pytest.mark.pandas
+def test_roundtrip_nanosecond_resolution_pandas_temporal_objects():
+    # corner case discovered by hypothesis: preserving the nanoseconds on
+    # conversion from a list of Timedelta and Timestamp objects
+    import pandas as pd
+
+    ty = pa.duration('ns')
+    arr = pa.array([9223371273709551616], type=ty)
+    data = arr.to_pylist()
+    assert isinstance(data[0], pd.Timedelta)
+    restored = pa.array(data, type=ty)
+    assert arr.equals(restored)
+
+    ty = pa.timestamp('ns')
+    arr = pa.array([9223371273709551616], type=ty)
+    data = arr.to_pylist()
+    assert isinstance(data[0], pd.Timestamp)
+    restored = pa.array(data, type=ty)
+    assert arr.equals(restored)
 
 
 @h.given(past.all_arrays)
 def test_array_to_pylist_roundtrip(arr):
     # TODO(kszucs): ARROW-9997

Review comment:
       Shouldn't you remove this TODO?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org