You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/13 21:15:48 UTC

[arrow] branch master updated: ARROW-2646: [C++/Python] Pandas roundtrip for date objects

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 67c05c2  ARROW-2646: [C++/Python] Pandas roundtrip for date objects
67c05c2 is described below

commit 67c05c203dded7896c1fd00c89ffc89a3eab50e1
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Thu Sep 13 17:15:20 2018 -0400

    ARROW-2646: [C++/Python] Pandas roundtrip for date objects
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #2535 from kszucs/ARROW-2646 and squashes the following commits:
    
    1f36aa6d9 <Krisztián Szűcs> add plasma_store_server to gitignore
    2f6a31061 <Krisztián Szűcs> flake8
    070e97520 <Krisztián Szűcs> test case for ChunkedArray and Column
    8b44f7a02 <Krisztián Szűcs> support date_as_object in ArrowDeserializer
    6ee8d2d85 <Krisztián Szűcs> ConvertDates if date_as_object PandasOption is set
---
 cpp/src/arrow/python/arrow_to_pandas.cc     | 77 +++++++++++++++++++++++++---
 cpp/src/arrow/python/arrow_to_pandas.h      |  2 +
 cpp/src/arrow/python/util/datetime.h        | 21 ++++++++
 python/.gitignore                           |  1 +
 python/pyarrow/array.pxi                    |  6 ++-
 python/pyarrow/includes/libarrow.pxd        |  1 +
 python/pyarrow/table.pxi                    | 14 +++--
 python/pyarrow/tests/test_convert_pandas.py | 79 +++++++++++++++++++++++++++++
 8 files changed, 191 insertions(+), 10 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 9fb9194..6a142cd 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -633,6 +633,37 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values)
 }
 
 template <typename TYPE>
+static Status ConvertDates(PandasOptions options, const ChunkedArray& data,  // NOTE(review): `options` is not read in this body
+                           PyObject** out_values) {  // Writes one PyObject* per element of `data` into out_values, chunk by chunk.
+  using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+
+  PyAcquireGIL lock;  // presumably a scoped GIL guard; the loop below calls into the Python C-API -- confirm
+  OwnedRef date_ref;  // NOTE(review): never referenced in this function
+
+  PyDateTime_IMPORT;  // CPython datetime C-API import macro; PyDate_from_int relies on PyDate_FromDate
+
+  for (int c = 0; c < data.num_chunks(); c++) {  // out_values keeps advancing across chunks, so output is contiguous
+    const auto& arr = checked_cast<const ArrayType&>(*data.chunk(c));
+    auto type = std::dynamic_pointer_cast<TYPE>(arr.type());  // concrete date type, needed for unit() below
+    DCHECK(type);
+
+    const DateUnit unit = type->unit();  // read once per chunk, passed to every conversion in the inner loop
+
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (arr.IsNull(i)) {
+        Py_INCREF(Py_None);  // the output slot gets its own reference to None
+        *out_values++ = Py_None;  // nulls are emitted as None
+      } else {
+        RETURN_NOT_OK(PyDate_from_int(arr.Value(i), unit, out_values++));  // non-null: build a Python date in place
+        RETURN_IF_PYERROR();  // presumably turns a pending Python exception into an error Status -- confirm
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+template <typename TYPE>
 static Status ConvertTimes(PandasOptions options, const ChunkedArray& data,
                            PyObject** out_values) {
   using ArrayType = typename TypeTraits<TYPE>::ArrayType;
@@ -733,6 +764,10 @@ class ObjectBlock : public PandasBlock {
       RETURN_NOT_OK(ConvertBinaryLike<StringType>(options_, data, out_buffer));
     } else if (type == Type::FIXED_SIZE_BINARY) {
       RETURN_NOT_OK(ConvertFixedSizeBinary(options_, data, out_buffer));
+    } else if (type == Type::DATE32) {
+      RETURN_NOT_OK(ConvertDates<Date32Type>(options_, data, out_buffer));
+    } else if (type == Type::DATE64) {
+      RETURN_NOT_OK(ConvertDates<Date64Type>(options_, data, out_buffer));
     } else if (type == Type::TIME32) {
       RETURN_NOT_OK(ConvertTimes<Time32Type>(options_, data, out_buffer));
     } else if (type == Type::TIME64) {
@@ -759,6 +794,7 @@ class ObjectBlock : public PandasBlock {
         CONVERTLISTSLIKE_CASE(StringType, STRING)
         CONVERTLISTSLIKE_CASE(ListType, LIST)
         CONVERTLISTSLIKE_CASE(NullType, NA)
+        // TODO(kszucs) Time and Date?
         default: {
           std::stringstream ss;
           ss << "Not implemented type for conversion from List to Pandas ObjectBlock: "
@@ -1322,10 +1358,8 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options
       *output_type = PandasBlock::OBJECT;
       break;
     case Type::DATE32:
-      *output_type = PandasBlock::DATETIME;
-      break;
     case Type::DATE64:
-      *output_type = PandasBlock::DATETIME;
+      *output_type = options.date_as_object ? PandasBlock::OBJECT : PandasBlock::DATETIME;
       break;
     case Type::TIMESTAMP: {
       const auto& ts_type = checked_cast<const TimestampType&>(*col.type());
@@ -1660,9 +1694,7 @@ class ArrowDeserializer {
   }
 
   template <typename Type>
-  typename std::enable_if<std::is_base_of<DateType, Type>::value ||
-                              std::is_base_of<TimestampType, Type>::value,
-                          Status>::type
+  typename std::enable_if<std::is_base_of<TimestampType, Type>::value, Status>::type
   Visit(const Type& type) {
     if (options_.zero_copy_only) {
       return Status::Invalid("Copy Needed, but zero_copy_only was True");
@@ -1692,6 +1724,39 @@ class ArrowDeserializer {
   }
 
   template <typename Type>
+  typename std::enable_if<std::is_base_of<DateType, Type>::value, Status>::type Visit(
+      const Type& type) {  // Date overload: Python objects when date_as_object is set, otherwise a NumPy array of scaled values.
+    if (options_.zero_copy_only) {  // this overload always rejects zero_copy_only
+      return Status::Invalid("Copy Needed, but zero_copy_only was True");
+    }
+    if (options_.date_as_object) {
+      return VisitObjects(ConvertDates<Type>);  // object path; VisitObjects is defined outside this view
+    }
+
+    constexpr int TYPE = Type::type_id;
+    using traits = internal::arrow_traits<TYPE>;  // supplies npy_type, na_value, npy_shift and T used below
+    using c_type = typename Type::c_type;  // element type of the Arrow input values
+
+    typedef typename traits::T T;  // element type of the output array
+
+    RETURN_NOT_OK(AllocateOutput(traits::npy_type));
+    auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));  // arr_ is presumably set by AllocateOutput -- confirm
+
+    constexpr T na_value = traits::na_value;
+    constexpr int64_t kShift = traits::npy_shift;  // divisor applied to every non-null input value
+
+    for (int c = 0; c < data_.num_chunks(); c++) {  // out_values keeps advancing across chunks
+      const auto& arr = *data_.chunk(c);
+      const c_type* in_values = GetPrimitiveValues<c_type>(arr);
+
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        *out_values++ = arr.IsNull(i) ? na_value : static_cast<T>(in_values[i]) / kShift;  // nulls -> na_value; the division applies only to the non-null branch
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename Type>
   typename std::enable_if<std::is_base_of<TimeType, Type>::value, Status>::type Visit(
       const Type& type) {
     return Status::NotImplemented("Don't know how to serialize Arrow time type to NumPy");
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 2a338ac..138b010 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -46,12 +46,14 @@ struct PandasOptions {
   bool strings_to_categorical;
   bool zero_copy_only;
   bool integer_object_nulls;
+  bool date_as_object;
   bool use_threads;
 
   PandasOptions()
       : strings_to_categorical(false),
         zero_copy_only(false),
         integer_object_nulls(false),
+        date_as_object(false),
         use_threads(false) {}
 };
 
diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h
index e76c2e0..d39178d 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -211,6 +211,19 @@ static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,
   return Status::OK();
 }
 
+static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year,
+                                        int64_t* month, int64_t* day) {  // Fills year/month/day for `val` expressed in `unit`, via get_date_from_days.
+  switch (unit) {
+    case DateUnit::MILLI:
+      val /= 86400000LL;  // ms -> days (86400000 ms per day); truncates toward zero; no break, falls through to DAY
+    case DateUnit::DAY:
+      get_date_from_days(val, year, month, day);  // no break: falls into default, which only breaks
+    default:
+      break;  // NOTE(review): any other unit leaves the outputs untouched and OK is still returned
+  }
+  return Status::OK();
+}
+
 static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit,
                                      PyObject** out) {
   int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
@@ -220,6 +233,14 @@ static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit,
   return Status::OK();
 }
 
+static inline Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) {  // Stores in *out a Python date built from `val` in `unit`.
+  int64_t year = 0, month = 0, day = 0;  // zero-initialised; PyDate_convert_int overwrites them for the units it handles
+  RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day));
+  *out = PyDate_FromDate(static_cast<int32_t>(year), static_cast<int32_t>(month),
+                         static_cast<int32_t>(day));  // NOTE(review): result is not checked here; ConvertDates calls RETURN_IF_PYERROR after this function
+  return Status::OK();
+}
+
 static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit,
                                          PyObject** out) {
   int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
diff --git a/python/.gitignore b/python/.gitignore
index c6125ad..fac4e99 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -43,3 +43,4 @@ manylinux1/arrow
 
 # plasma store
 pyarrow/plasma_store
+pyarrow/plasma_store_server
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index bc21e16..52af717 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -600,7 +600,8 @@ cdef class Array:
         return pyarrow_wrap_array(result)
 
     def to_pandas(self, bint strings_to_categorical=False,
-                  bint zero_copy_only=False, bint integer_object_nulls=False):
+                  bint zero_copy_only=False, bint integer_object_nulls=False,
+                  bint date_as_object=False):
         """
         Convert to a NumPy array object suitable for use in pandas.
 
@@ -613,6 +614,8 @@ cdef class Array:
             the underlying data
         integer_object_nulls : boolean, default False
             Cast integers with nulls to objects
+        date_as_object : boolean, default False
+            Cast dates to objects
 
         See also
         --------
@@ -628,6 +631,7 @@ cdef class Array:
             strings_to_categorical=strings_to_categorical,
             zero_copy_only=zero_copy_only,
             integer_object_nulls=integer_object_nulls,
+            date_as_object=date_as_object,
             use_threads=False)
         with nogil:
             check_status(ConvertArrayToPandas(options, self.sp_array,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8a91bf5..1a499df 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1029,6 +1029,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool strings_to_categorical
         c_bool zero_copy_only
         c_bool integer_object_nulls
+        c_bool date_as_object
         c_bool use_threads
 
 cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 62f6803..cc21bbf 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -140,7 +140,8 @@ cdef class ChunkedArray:
         return result
 
     def to_pandas(self, bint strings_to_categorical=False,
-                  bint zero_copy_only=False, bint integer_object_nulls=False):
+                  bint zero_copy_only=False, bint integer_object_nulls=False,
+                  bint date_as_object=False):
         """
         Convert the arrow::ChunkedArray to an array object suitable for use
         in pandas
@@ -157,6 +158,7 @@ cdef class ChunkedArray:
             strings_to_categorical=strings_to_categorical,
             zero_copy_only=zero_copy_only,
             integer_object_nulls=integer_object_nulls,
+            date_as_object=date_as_object,
             use_threads=False)
 
         with nogil:
@@ -483,7 +485,8 @@ cdef class Column:
         return [pyarrow_wrap_column(col) for col in flattened]
 
     def to_pandas(self, bint strings_to_categorical=False,
-                  bint zero_copy_only=False, bint integer_object_nulls=False):
+                  bint zero_copy_only=False, bint integer_object_nulls=False,
+                  bint date_as_object=False):
         """
         Convert the arrow::Column to a pandas.Series
 
@@ -494,6 +497,7 @@ cdef class Column:
         values = self.data.to_pandas(
             strings_to_categorical=strings_to_categorical,
             zero_copy_only=zero_copy_only,
+            date_as_object=date_as_object,
             integer_object_nulls=integer_object_nulls)
         result = pd.Series(values, name=self.name)
 
@@ -1317,7 +1321,8 @@ cdef class Table:
 
     def to_pandas(self, bint strings_to_categorical=False,
                   memory_pool=None, bint zero_copy_only=False, categories=None,
-                  bint integer_object_nulls=False, bint use_threads=True):
+                  bint integer_object_nulls=False, bint date_as_object=False,
+                  bint use_threads=True):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -1334,6 +1339,8 @@ cdef class Table:
             List of columns that should be returned as pandas.Categorical
         integer_object_nulls : boolean, default False
             Cast integers with nulls to objects
+        date_as_object : boolean, default False
+            Cast dates to objects
         use_threads: boolean, default True
             Whether to parallelize the conversion using multiple threads
 
@@ -1348,6 +1355,7 @@ cdef class Table:
             strings_to_categorical=strings_to_categorical,
             zero_copy_only=zero_copy_only,
             integer_object_nulls=integer_object_nulls,
+            date_as_object=date_as_object,
             use_threads=use_threads)
 
         mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 3fa7cf4..bb53c14 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -906,6 +906,85 @@ class TestConvertDateTimeLikeTypes(object):
         with pytest.raises(pa.ArrowInvalid, match=expected_msg):
             pa.Array.from_pandas(s, type=pa.date64(), mask=mask)
 
+    def test_array_date_as_object(self):  # Array.to_pandas with and without date_as_object
+        data = [date(2000, 1, 1),
+                None,
+                date(1970, 1, 1),
+                date(2040, 2, 26)]
+        expected = np.array(['2000-01-01',
+                             None,
+                             '1970-01-01',
+                             '2040-02-26'], dtype='datetime64')
+
+        arr = pa.array(data)
+        assert arr.equals(pa.array(expected))  # the date list and the datetime64 array must build equal Arrow arrays
+
+        result = arr.to_pandas()  # default conversion (date_as_object left at False)
+        assert result.dtype == expected.dtype
+        npt.assert_array_equal(arr.to_pandas(), expected)  # converts a second time instead of reusing `result`
+
+        result = arr.to_pandas(date_as_object=True)
+        expected = expected.astype(object)  # same values recast to object dtype for the comparison
+        assert result.dtype == expected.dtype
+        npt.assert_array_equal(result, expected)
+
+    def test_chunked_array_convert_date_as_object(self):  # ChunkedArray.to_pandas with and without date_as_object
+        data = [date(2000, 1, 1),
+                None,
+                date(1970, 1, 1),
+                date(2040, 2, 26)]
+        expected = np.array(['2000-01-01',
+                             None,
+                             '1970-01-01',
+                             '2040-02-26'], dtype='datetime64')
+        carr = pa.chunked_array([data])  # a single chunk holding all four values
+
+        result = carr.to_pandas()  # default conversion (date_as_object left at False)
+        assert result.dtype == expected.dtype
+        npt.assert_array_equal(carr.to_pandas(), expected)  # converts a second time instead of reusing `result`
+
+        result = carr.to_pandas(date_as_object=True)
+        expected = expected.astype(object)  # same values recast to object dtype for the comparison
+        assert result.dtype == expected.dtype
+        npt.assert_array_equal(result, expected)
+
+    def test_column_convert_date_as_object(self):  # Column.to_pandas with and without date_as_object
+        data = [date(2000, 1, 1),
+                None,
+                date(1970, 1, 1),
+                date(2040, 2, 26)]
+        expected = np.array(['2000-01-01',
+                             None,
+                             '1970-01-01',
+                             '2040-02-26'], dtype='datetime64')
+
+        arr = pa.array(data)
+        column = pa.column('date', arr)
+
+        result = column.to_pandas()  # NOTE(review): this value is overwritten below without being read
+        npt.assert_array_equal(column.to_pandas(), expected)  # default path: values only, no dtype assertion in this test
+
+        result = column.to_pandas(date_as_object=True)
+        expected = expected.astype(object)  # same values recast to object dtype for the comparison
+        assert result.dtype == expected.dtype
+        npt.assert_array_equal(result, expected)
+
+    def test_table_convert_date_as_object(self):  # Table.to_pandas with and without date_as_object
+        df = pd.DataFrame({
+            'date': [date(2000, 1, 1),
+                     None,
+                     date(1970, 1, 1),
+                     date(2040, 2, 26)]})
+
+        table = pa.Table.from_pandas(df, preserve_index=False)
+
+        df_datetime = table.to_pandas()  # default conversion (date_as_object left at False)
+        df_object = table.to_pandas(date_as_object=True)
+
+        tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime,
+                              check_dtype=True)  # default result must equal the source frame cast to datetime64[ns]
+        tm.assert_frame_equal(df, df_object, check_dtype=True)  # date_as_object=True must reproduce the source frame exactly
+
     def test_date_infer(self):
         df = pd.DataFrame({
             'date': [date(2000, 1, 1),