You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/08/08 00:13:47 UTC

[GitHub] [arrow] wesm commented on a change in pull request #7816: ARROW-9528: [Python] Honor tzinfo when converting from datetime

wesm commented on a change in pull request #7816:
URL: https://github.com/apache/arrow/pull/7816#discussion_r467330463



##########
File path: ci/scripts/integration_spark.sh
##########
@@ -22,6 +22,9 @@ source_dir=${1}
 spark_dir=${2}
 spark_version=${SPARK_VERSION:-master}
 
+# Use old behavior that always dropped tiemzones.
+export ARROW_NO_TZ=1

Review comment:
       Nit: might be better to call this something more explicit like `PYARROW_PANDAS_DROP_NESTED_TZ` (since it only impacts tz-aware datetimes inside e.g. structs, right?)

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -642,24 +641,27 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
   std::vector<OwnedRef> fields_data(num_fields);
   OwnedRef dict_item;
 
-  // XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second
+  // In ARROW-7723, we found as a result of ARROW-3789 that second
   // through microsecond resolution tz-aware timestamps were being promoted to
   // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
   // array in this function. PyArray_GETITEM returns datetime.datetime for
   // units second through microsecond but PyLong for nanosecond (because
-  // datetime.datetime does not support nanoseconds). We inserted this hack to
-  // preserve the <= 0.15.1 behavior until a better solution can be devised
+  // datetime.datetime does not support nanoseconds).
+  // We force the object conversion to preserve the value of the timezone.

Review comment:
       Might be worth stating again for emphasis that nanoseconds are returned as integers inside structs (I got confused myself when I saw it because I forgot about this)

##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -191,10 +191,11 @@ struct ValueConverter<Date64Type> {
 
 template <>
 struct ValueConverter<Time32Type> {
-  static inline Result<int32_t> FromPython(PyObject* obj, TimeUnit::type unit) {
+  static inline Result<int32_t> FromPython(PyObject* obj, TimeUnit::type unit,
+                                           bool /*ignore_timezone*/) {

Review comment:
       This is a bit ugly, could we pass an options struct instead?

##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -1328,6 +1357,10 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask,
 
   if (options.type == nullptr) {
     RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type));
+    if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) {
+      const auto& ts_type = checked_cast<TimestampType&>(*real_type);

Review comment:
       be consistent about const?

##########
File path: cpp/src/arrow/python/arrow_to_pandas.cc
##########
@@ -951,8 +953,21 @@ struct ObjectWriterVisitor {
   template <typename Type>
   enable_if_timestamp<Type, Status> Visit(const Type& type) {
     const TimeUnit::type unit = type.unit();
-    auto WrapValue = [unit](typename Type::c_type value, PyObject** out) {
+    OwnedRef tzinfo;
+    if (!type.timezone().empty() && !options.ignore_timezone) {
+      ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone()));
+      RETURN_IF_PYERROR();
+    }
+    auto WrapValue = [&](typename Type::c_type value, PyObject** out) {

Review comment:
       I'd suggest declaring two lambdas, one that is the original WrapValue and another that adds the tzinfo to the result of WrapValue

##########
File path: cpp/src/arrow/python/common.h
##########
@@ -137,6 +137,11 @@ class ARROW_PYTHON_EXPORT OwnedRef {
   OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
   explicit OwnedRef(PyObject* obj) : obj_(obj) {}
 
+  OwnedRef& operator=(PyObject* obj) {
+    obj_ = obj;
+    return *this;
+  }

Review comment:
       Is this definitely needed? I think it's cleaner to use the explicit reset/detach methods

##########
File path: cpp/src/arrow/python/datetime.h
##########
@@ -157,6 +157,22 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
   return PyDelta_to_us(pytimedelta) * 1000;
 }
 
+ARROW_PYTHON_EXPORT

Review comment:
       Do these need to be exported?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -861,10 +861,10 @@ void AddBinaryLength(FunctionRegistry* registry) {
       applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
   ArrayKernelExec exec_offset_64 =
       applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, BinaryLength>::Exec;
-  for (const auto input_type : {binary(), utf8()}) {
+  for (const auto& input_type : {binary(), utf8()}) {

Review comment:
       I agree it's good to fix but not required for the pR

##########
File path: python/pyarrow/tests/test_pandas.py
##########
@@ -3325,13 +3325,35 @@ def test_cast_timestamp_unit():
     assert result.equals(expected)
 
 
-def test_struct_with_timestamp_tz():
+def test_nested_with_timestamp_tz_round_trip():
+    ts = pd.Timestamp.now()
+    ts_dt = ts.to_pydatetime()
+    arr = pa.array([ts_dt], type=pa.timestamp('us', tz='America/New_York'))
+    struct = pa.StructArray.from_arrays([arr, arr], ['start', 'stop'])
+
+    result = struct.to_pandas()
+    # N.B. we test with Pandas because the Arrow types are not

Review comment:
       Indeed it seems like `pa.array(result)` should recover `struct` and we want to verify that?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org