You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/07/01 01:22:22 UTC

[GitHub] [arrow] wesm commented on a change in pull request #7584: ARROW-9272: [C++][Python] Reduce complexity in python to arrow conversion

wesm commented on a change in pull request #7584:
URL: https://github.com/apache/arrow/pull/7584#discussion_r448064117



##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -53,6 +53,335 @@ using internal::checked_pointer_cast;
 
 namespace py {
 
+// ----------------------------------------------------------------------
+// NullCoding
+
+// Selects how null Python objects are recognized; each enumerator has a
+// matching NullChecker specialization.
+enum class NullCoding : char {
+  // Only the None singleton counts as null.
+  NONE_ONLY,
+  // Nullness is decided by internal::PandasObjectIsNull.
+  PANDAS_SENTINELS
+};
+
+// Maps a NullCoding value to its null test at compile time. The primary
+// template defines no members; Check() exists only on the specializations.
+template <NullCoding kind>
+struct NullChecker {};
+
+// NONE_ONLY: an object is null exactly when it is the None singleton.
+template <>
+struct NullChecker<NullCoding::NONE_ONLY> {
+  static inline bool Check(PyObject* obj) {
+    return Py_None == obj;
+  }
+};
+
+// PANDAS_SENTINELS: the decision is delegated to internal::PandasObjectIsNull.
+template <>
+struct NullChecker<NullCoding::PANDAS_SENTINELS> {
+  static inline bool Check(PyObject* obj) {
+    const bool is_null = internal::PandasObjectIsNull(obj);
+    return is_null;
+  }
+};
+
+// ----------------------------------------------------------------------
+// ValueConverters
+//
+// Typed conversion logic for single Python objects is encapsulated in
+// ValueConverter structs using SFINAE for specialization.
+//
+// The FromPython method is responsible for converting the Python object to
+// the C++ value counterpart, which can be appended directly to an
+// ArrayBuilder or used to construct a Scalar.
+
+// Primary template: defines no members. Conversions are provided by the full
+// and partial specializations; the defaulted `Enable` parameter is the SFINAE
+// hook used by the enable_if_* partial specializations.
+template <typename Type, typename Enable = void>
+struct ValueConverter {};
+
+// Boolean conversion: only the Py_True / Py_False singletons are accepted;
+// every other object yields an InvalidValue error.
+template <>
+struct ValueConverter<BooleanType> {
+  static inline Result<bool> FromPython(PyObject* obj) {
+    if (obj == Py_True) {
+      return true;
+    }
+    if (obj == Py_False) {
+      return false;
+    }
+    return internal::InvalidValue(obj, "tried to convert to boolean");
+  }
+};
+
+// Integer conversion for every Arrow integer type: the Python object is read
+// into the type's C representation by internal::CIntFromPython, whose error
+// status (if any) is propagated to the caller.
+template <typename Type>
+struct ValueConverter<Type, enable_if_integer<Type>> {
+  using ValueType = typename Type::c_type;
+
+  static inline Result<ValueType> FromPython(PyObject* obj) {
+    ValueType c_value;
+    RETURN_NOT_OK(internal::CIntFromPython(obj, &c_value));
+    return c_value;
+  }
+};
+
+// Half-float conversion: delegates to PyFloat_AsHalf and propagates its
+// error status.
+template <>
+struct ValueConverter<HalfFloatType> {
+  using ValueType = typename HalfFloatType::c_type;
+
+  static inline Result<ValueType> FromPython(PyObject* obj) {
+    ValueType half_value;
+    RETURN_NOT_OK(PyFloat_AsHalf(obj, &half_value));
+    return half_value;
+  }
+};
+
+// float32 conversion. Float scalars are read as double and narrowed; integer
+// scalars go through internal::IntegerScalarToFloat32Safe; anything else is
+// rejected with an InvalidValue error.
+template <>
+struct ValueConverter<FloatType> {
+  static inline Result<float> FromPython(PyObject* obj) {
+    if (internal::PyFloatScalar_Check(obj)) {
+      const float narrowed = static_cast<float>(PyFloat_AsDouble(obj));
+      // PyFloat_AsDouble reports failure through the Python error indicator.
+      RETURN_IF_PYERROR();
+      return narrowed;
+    }
+    if (internal::PyIntScalar_Check(obj)) {
+      float from_int;
+      RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &from_int));
+      return from_int;
+    }
+    return internal::InvalidValue(obj, "tried to convert to float32");
+  }
+};
+
+// float64 conversion. Python floats are read through the PyFloat_AS_DOUBLE
+// macro, other float scalars through PyFloat_AsDouble, integer scalars
+// through internal::IntegerScalarToDoubleSafe; anything else is rejected.
+template <>
+struct ValueConverter<DoubleType> {
+  static inline Result<double> FromPython(PyObject* obj) {
+    if (PyFloat_Check(obj)) {
+      return PyFloat_AS_DOUBLE(obj);
+    }
+    if (internal::PyFloatScalar_Check(obj)) {
+      // Other kinds of float-y things
+      const double as_double = PyFloat_AsDouble(obj);
+      RETURN_IF_PYERROR();
+      return as_double;
+    }
+    if (internal::PyIntScalar_Check(obj)) {
+      double from_int;
+      RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &from_int));
+      return from_int;
+    }
+    return internal::InvalidValue(obj, "tried to convert to double");
+  }
+};
+
+// date32 conversion: objects passing PyDate_Check are converted with
+// internal::PyDate_to_days; everything else is read as a C integer.
+template <>
+struct ValueConverter<Date32Type> {
+  static inline Result<int32_t> FromPython(PyObject* obj) {
+    if (PyDate_Check(obj)) {
+      auto* as_date = reinterpret_cast<PyDateTime_Date*>(obj);
+      return static_cast<int32_t>(internal::PyDate_to_days(as_date));
+    }
+    int32_t days;
+    RETURN_NOT_OK(
+        internal::CIntFromPython(obj, &days, "Integer too large for date32"));
+    return days;
+  }
+};
+
+// date64 conversion (milliseconds).
+//
+// - Objects passing PyDateTime_Check are converted with
+//   internal::PyDateTime_to_ms and then floored to a whole number of days.
+// - Objects passing PyDate_Check are converted with internal::PyDate_to_ms.
+// - Everything else is read as a C integer and returned as-is.
+template <>
+struct ValueConverter<Date64Type> {
+  static inline Result<int64_t> FromPython(PyObject* obj) {
+    constexpr int64_t kMillisPerDay = 86400000LL;
+    int64_t value;
+    if (PyDateTime_Check(obj)) {
+      auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
+      value = internal::PyDateTime_to_ms(pydate);
+      // Drop any intraday milliseconds. C++ `%` truncates toward zero, so for
+      // a negative `value` the remainder is negative and plain subtraction
+      // would move the result up to the *following* midnight; step back one
+      // day in that case so the result is the start of the value's own day.
+      const int64_t intraday = value % kMillisPerDay;
+      value -= intraday;
+      if (intraday < 0) {
+        value -= kMillisPerDay;
+      }
+    } else if (PyDate_Check(obj)) {
+      auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
+      value = internal::PyDate_to_ms(pydate);
+    } else {
+      RETURN_NOT_OK(
+          internal::CIntFromPython(obj, &value, "Integer too large for date64"));
+    }
+    return value;
+  }
+};
+
+// time32 conversion. Objects passing PyTime_Check are converted according to
+// `unit` (SECOND or MILLI; any other unit is an error); everything else is
+// read as a C integer.
+template <>
+struct ValueConverter<Time32Type> {
+  static inline Result<int32_t> FromPython(PyObject* obj, TimeUnit::type unit) {
+    if (!PyTime_Check(obj)) {
+      int32_t raw;
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &raw, "Integer too large for int32"));
+      return raw;
+    }
+    // datetime.time stores microsecond resolution
+    if (unit == TimeUnit::SECOND) {
+      const int32_t seconds = static_cast<int32_t>(internal::PyTime_to_s(obj));
+      return seconds;
+    }
+    if (unit == TimeUnit::MILLI) {
+      const int32_t millis = static_cast<int32_t>(internal::PyTime_to_ms(obj));
+      return millis;
+    }
+    return Status::UnknownError("Invalid time unit");
+  }
+};
+
+// time64 conversion. Objects passing PyTime_Check are converted according to
+// `unit` (MICRO or NANO; any other unit is an error); everything else is
+// read as a C integer.
+template <>
+struct ValueConverter<Time64Type> {
+  static inline Result<int64_t> FromPython(PyObject* obj, TimeUnit::type unit) {
+    if (!PyTime_Check(obj)) {
+      int64_t raw;
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &raw, "Integer too large for int64"));
+      return raw;
+    }
+    // datetime.time stores microsecond resolution
+    if (unit == TimeUnit::MICRO) {
+      const int64_t micros = internal::PyTime_to_us(obj);
+      return micros;
+    }
+    if (unit == TimeUnit::NANO) {
+      const int64_t nanos = internal::PyTime_to_ns(obj);
+      return nanos;
+    }
+    return Status::UnknownError("Invalid time unit");
+  }
+};
+
+// Timestamp conversion.
+//
+// FromPython: objects passing PyDateTime_Check are converted with the
+// internal::PyDateTime_to_* helper matching `unit`; everything else is read
+// as a C integer and returned as-is.
+//
+// FromNumpy: takes a NumPy scalar, checks that its dtype maps to an Arrow
+// timestamp with the same unit as `unit`, and returns the scalar's raw
+// `obval`.
+template <>
+struct ValueConverter<TimestampType> {
+  static inline Result<int64_t> FromPython(PyObject* obj, TimeUnit::type unit) {
+    int64_t value;
+    if (PyDateTime_Check(obj)) {
+      auto dt = reinterpret_cast<PyDateTime_DateTime*>(obj);
+      switch (unit) {
+        case TimeUnit::SECOND:
+          value = internal::PyDateTime_to_s(dt);
+          break;
+        case TimeUnit::MILLI:
+          value = internal::PyDateTime_to_ms(dt);
+          break;
+        case TimeUnit::MICRO:
+          value = internal::PyDateTime_to_us(dt);
+          break;
+        case TimeUnit::NANO:
+          value = internal::PyDateTime_to_ns(dt);
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
+  }
+
+  static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) {
+    // validate that the numpy scalar has np.datetime64 dtype
+    std::shared_ptr<DataType> type;
+    // PyArray_DescrFromScalar returns a new reference; release it as soon as
+    // the Arrow type has been derived so the descriptor is not leaked on
+    // either the success or the error path.
+    PyArray_Descr* descr = PyArray_DescrFromScalar(obj);
+    Status dtype_status = NumPyDtypeToArrow(descr, &type);
+    Py_XDECREF(descr);
+    RETURN_NOT_OK(dtype_status);
+    if (type->id() != TimestampType::type_id) {
+      // TODO(kszucs): the message should highlight the received numpy dtype
+      return Status::Invalid("Expected np.datetime64 but got: ", type->ToString());
+    }
+    // validate that the time units are matching
+    if (unit != checked_cast<const TimestampType&>(*type).unit()) {
+      return Status::NotImplemented(
+          "Cannot convert NumPy np.datetime64 objects with differing unit");
+    }
+    // convert the numpy value
+    return reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
+  }
+};
+
+// Duration conversion.
+//
+// FromPython: objects passing PyDelta_Check are converted with the
+// internal::PyDelta_to_* helper matching `unit`; everything else is read as
+// a C integer and returned as-is.
+//
+// FromNumpy: takes a NumPy scalar, checks that its dtype maps to an Arrow
+// duration with the same unit as `unit`, and returns the scalar's raw
+// `obval`.
+template <>
+struct ValueConverter<DurationType> {
+  static inline Result<int64_t> FromPython(PyObject* obj, TimeUnit::type unit) {
+    int64_t value;
+    if (PyDelta_Check(obj)) {
+      auto dt = reinterpret_cast<PyDateTime_Delta*>(obj);
+      switch (unit) {
+        case TimeUnit::SECOND:
+          value = internal::PyDelta_to_s(dt);
+          break;
+        case TimeUnit::MILLI:
+          value = internal::PyDelta_to_ms(dt);
+          break;
+        case TimeUnit::MICRO:
+          value = internal::PyDelta_to_us(dt);
+          break;
+        case TimeUnit::NANO:
+          value = internal::PyDelta_to_ns(dt);
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
+  }
+
+  static inline Result<int64_t> FromNumpy(PyObject* obj, TimeUnit::type unit) {
+    // validate that the numpy scalar has np.timedelta64 dtype
+    std::shared_ptr<DataType> type;
+    // PyArray_DescrFromScalar returns a new reference; release it as soon as
+    // the Arrow type has been derived so the descriptor is not leaked on
+    // either the success or the error path.
+    PyArray_Descr* descr = PyArray_DescrFromScalar(obj);
+    Status dtype_status = NumPyDtypeToArrow(descr, &type);
+    Py_XDECREF(descr);
+    RETURN_NOT_OK(dtype_status);
+    if (type->id() != DurationType::type_id) {
+      // TODO(kszucs): the message should highlight the received numpy dtype
+      return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString());
+    }
+    // validate that the time units are matching
+    if (unit != checked_cast<const DurationType&>(*type).unit()) {
+      return Status::NotImplemented(
+          "Cannot convert NumPy np.timedelta64 objects with differing unit");
+    }
+    // convert the numpy value
+    return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
+  }
+};
+
+// Binary conversion for every Arrow binary type: the object is wrapped in a
+// PyBytesView via PyBytesView::FromString, whose error status (if any) is
+// propagated.
+template <typename Type>
+struct ValueConverter<Type, enable_if_any_binary<Type>> {
+  static inline Result<PyBytesView> FromPython(PyObject* obj) {
+    PyBytesView bytes;
+    RETURN_NOT_OK(bytes.FromString(obj));
+    // Move the view into the returned Result.
+    return std::move(bytes);
+  }
+};
+
+template <typename Type>
+struct ValueConverter<Type, enable_if_string_like<Type>> {
+  static inline Result<PyBytesView> FromPython(PyObject* obj) {
+    // strict conversion, force output to be unicode / utf8 and validate that
+    // any binary values are utf8
+    bool is_utf8 = false;
+    PyBytesView view;
+
+    RETURN_NOT_OK(view.FromString(obj, &is_utf8));
+    if (!is_utf8) {
+      return internal::InvalidValue(obj, "was not a utf8 string");
+    }
+    return std::move(view);
+  }
+
+  static inline Result<PyBytesView> FromPython(PyObject* obj, bool& is_utf8) {

Review comment:
       Need `bool* is_utf8` here




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org