You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/13 21:15:48 UTC
[arrow] branch master updated: ARROW-2646: [C++/Python] Pandas roundtrip for date objects
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 67c05c2 ARROW-2646: [C++/Python] Pandas roundtrip for date objects
67c05c2 is described below
commit 67c05c203dded7896c1fd00c89ffc89a3eab50e1
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Thu Sep 13 17:15:20 2018 -0400
ARROW-2646: [C++/Python] Pandas roundtrip for date objects
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #2535 from kszucs/ARROW-2646 and squashes the following commits:
1f36aa6d9 <Krisztián Szűcs> add plasma_store_server to gitignore
2f6a31061 <Krisztián Szűcs> flake8
070e97520 <Krisztián Szűcs> test case for ChunkedArray and Column
8b44f7a02 <Krisztián Szűcs> support date_as_object in ArrowDeserializer
6ee8d2d85 <Krisztián Szűcs> ConvertDates if date_as_object PandasOption is set
---
cpp/src/arrow/python/arrow_to_pandas.cc | 77 +++++++++++++++++++++++++---
cpp/src/arrow/python/arrow_to_pandas.h | 2 +
cpp/src/arrow/python/util/datetime.h | 21 ++++++++
python/.gitignore | 1 +
python/pyarrow/array.pxi | 6 ++-
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/table.pxi | 14 +++--
python/pyarrow/tests/test_convert_pandas.py | 79 +++++++++++++++++++++++++++++
8 files changed, 191 insertions(+), 10 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 9fb9194..6a142cd 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -633,6 +633,37 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values)
}
template <typename TYPE>
+static Status ConvertDates(PandasOptions options, const ChunkedArray& data,
+ PyObject** out_values) {
+ using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+
+ PyAcquireGIL lock;
+ OwnedRef date_ref;
+
+ PyDateTime_IMPORT;
+
+ for (int c = 0; c < data.num_chunks(); c++) {
+ const auto& arr = checked_cast<const ArrayType&>(*data.chunk(c));
+ auto type = std::dynamic_pointer_cast<TYPE>(arr.type());
+ DCHECK(type);
+
+ const DateUnit unit = type->unit();
+
+ for (int64_t i = 0; i < arr.length(); ++i) {
+ if (arr.IsNull(i)) {
+ Py_INCREF(Py_None);
+ *out_values++ = Py_None;
+ } else {
+ RETURN_NOT_OK(PyDate_from_int(arr.Value(i), unit, out_values++));
+ RETURN_IF_PYERROR();
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
+template <typename TYPE>
static Status ConvertTimes(PandasOptions options, const ChunkedArray& data,
PyObject** out_values) {
using ArrayType = typename TypeTraits<TYPE>::ArrayType;
@@ -733,6 +764,10 @@ class ObjectBlock : public PandasBlock {
RETURN_NOT_OK(ConvertBinaryLike<StringType>(options_, data, out_buffer));
} else if (type == Type::FIXED_SIZE_BINARY) {
RETURN_NOT_OK(ConvertFixedSizeBinary(options_, data, out_buffer));
+ } else if (type == Type::DATE32) {
+ RETURN_NOT_OK(ConvertDates<Date32Type>(options_, data, out_buffer));
+ } else if (type == Type::DATE64) {
+ RETURN_NOT_OK(ConvertDates<Date64Type>(options_, data, out_buffer));
} else if (type == Type::TIME32) {
RETURN_NOT_OK(ConvertTimes<Time32Type>(options_, data, out_buffer));
} else if (type == Type::TIME64) {
@@ -759,6 +794,7 @@ class ObjectBlock : public PandasBlock {
CONVERTLISTSLIKE_CASE(StringType, STRING)
CONVERTLISTSLIKE_CASE(ListType, LIST)
CONVERTLISTSLIKE_CASE(NullType, NA)
+ // TODO(kszucs) Time and Date?
default: {
std::stringstream ss;
ss << "Not implemented type for conversion from List to Pandas ObjectBlock: "
@@ -1322,10 +1358,8 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options
*output_type = PandasBlock::OBJECT;
break;
case Type::DATE32:
- *output_type = PandasBlock::DATETIME;
- break;
case Type::DATE64:
- *output_type = PandasBlock::DATETIME;
+ *output_type = options.date_as_object ? PandasBlock::OBJECT : PandasBlock::DATETIME;
break;
case Type::TIMESTAMP: {
const auto& ts_type = checked_cast<const TimestampType&>(*col.type());
@@ -1660,9 +1694,7 @@ class ArrowDeserializer {
}
template <typename Type>
- typename std::enable_if<std::is_base_of<DateType, Type>::value ||
- std::is_base_of<TimestampType, Type>::value,
- Status>::type
+ typename std::enable_if<std::is_base_of<TimestampType, Type>::value, Status>::type
Visit(const Type& type) {
if (options_.zero_copy_only) {
return Status::Invalid("Copy Needed, but zero_copy_only was True");
@@ -1692,6 +1724,39 @@ class ArrowDeserializer {
}
template <typename Type>
+ typename std::enable_if<std::is_base_of<DateType, Type>::value, Status>::type Visit(
+ const Type& type) {
+ if (options_.zero_copy_only) {
+ return Status::Invalid("Copy Needed, but zero_copy_only was True");
+ }
+ if (options_.date_as_object) {
+ return VisitObjects(ConvertDates<Type>);
+ }
+
+ constexpr int TYPE = Type::type_id;
+ using traits = internal::arrow_traits<TYPE>;
+ using c_type = typename Type::c_type;
+
+ typedef typename traits::T T;
+
+ RETURN_NOT_OK(AllocateOutput(traits::npy_type));
+ auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
+
+ constexpr T na_value = traits::na_value;
+ constexpr int64_t kShift = traits::npy_shift;
+
+ for (int c = 0; c < data_.num_chunks(); c++) {
+ const auto& arr = *data_.chunk(c);
+ const c_type* in_values = GetPrimitiveValues<c_type>(arr);
+
+ for (int64_t i = 0; i < arr.length(); ++i) {
+ *out_values++ = arr.IsNull(i) ? na_value : static_cast<T>(in_values[i]) / kShift;
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename Type>
typename std::enable_if<std::is_base_of<TimeType, Type>::value, Status>::type Visit(
const Type& type) {
return Status::NotImplemented("Don't know how to serialize Arrow time type to NumPy");
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 2a338ac..138b010 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -46,12 +46,14 @@ struct PandasOptions {
bool strings_to_categorical;
bool zero_copy_only;
bool integer_object_nulls;
+ bool date_as_object;
bool use_threads;
PandasOptions()
: strings_to_categorical(false),
zero_copy_only(false),
integer_object_nulls(false),
+ date_as_object(false),
use_threads(false) {}
};
diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h
index e76c2e0..d39178d 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -211,6 +211,19 @@ static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,
return Status::OK();
}
+static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year,
+ int64_t* month, int64_t* day) {
+ switch (unit) {
+ case DateUnit::MILLI:
+ val /= 86400000LL;
+ case DateUnit::DAY:
+ get_date_from_days(val, year, month, day);
+ default:
+ break;
+ }
+ return Status::OK();
+}
+
static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit,
PyObject** out) {
int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
@@ -220,6 +233,14 @@ static inline Status PyTime_from_int(int64_t val, const TimeUnit::type unit,
return Status::OK();
}
+static inline Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) {
+ int64_t year = 0, month = 0, day = 0;
+ RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day));
+ *out = PyDate_FromDate(static_cast<int32_t>(year), static_cast<int32_t>(month),
+ static_cast<int32_t>(day));
+ return Status::OK();
+}
+
static inline Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit,
PyObject** out) {
int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
diff --git a/python/.gitignore b/python/.gitignore
index c6125ad..fac4e99 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -43,3 +43,4 @@ manylinux1/arrow
# plasma store
pyarrow/plasma_store
+pyarrow/plasma_store_server
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index bc21e16..52af717 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -600,7 +600,8 @@ cdef class Array:
return pyarrow_wrap_array(result)
def to_pandas(self, bint strings_to_categorical=False,
- bint zero_copy_only=False, bint integer_object_nulls=False):
+ bint zero_copy_only=False, bint integer_object_nulls=False,
+ bint date_as_object=False):
"""
Convert to a NumPy array object suitable for use in pandas.
@@ -613,6 +614,8 @@ cdef class Array:
the underlying data
integer_object_nulls : boolean, default False
Cast integers with nulls to objects
+ date_as_object : boolean, default False
+ Cast dates to objects
See also
--------
@@ -628,6 +631,7 @@ cdef class Array:
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only,
integer_object_nulls=integer_object_nulls,
+ date_as_object=date_as_object,
use_threads=False)
with nogil:
check_status(ConvertArrayToPandas(options, self.sp_array,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8a91bf5..1a499df 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1029,6 +1029,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool strings_to_categorical
c_bool zero_copy_only
c_bool integer_object_nulls
+ c_bool date_as_object
c_bool use_threads
cdef extern from "arrow/python/api.h" namespace 'arrow::py' nogil:
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 62f6803..cc21bbf 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -140,7 +140,8 @@ cdef class ChunkedArray:
return result
def to_pandas(self, bint strings_to_categorical=False,
- bint zero_copy_only=False, bint integer_object_nulls=False):
+ bint zero_copy_only=False, bint integer_object_nulls=False,
+ bint date_as_object=False):
"""
Convert the arrow::ChunkedArray to an array object suitable for use
in pandas
@@ -157,6 +158,7 @@ cdef class ChunkedArray:
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only,
integer_object_nulls=integer_object_nulls,
+ date_as_object=date_as_object,
use_threads=False)
with nogil:
@@ -483,7 +485,8 @@ cdef class Column:
return [pyarrow_wrap_column(col) for col in flattened]
def to_pandas(self, bint strings_to_categorical=False,
- bint zero_copy_only=False, bint integer_object_nulls=False):
+ bint zero_copy_only=False, bint integer_object_nulls=False,
+ bint date_as_object=False):
"""
Convert the arrow::Column to a pandas.Series
@@ -494,6 +497,7 @@ cdef class Column:
values = self.data.to_pandas(
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only,
+ date_as_object=date_as_object,
integer_object_nulls=integer_object_nulls)
result = pd.Series(values, name=self.name)
@@ -1317,7 +1321,8 @@ cdef class Table:
def to_pandas(self, bint strings_to_categorical=False,
memory_pool=None, bint zero_copy_only=False, categories=None,
- bint integer_object_nulls=False, bint use_threads=True):
+ bint integer_object_nulls=False, bint date_as_object=False,
+ bint use_threads=True):
"""
Convert the arrow::Table to a pandas DataFrame
@@ -1334,6 +1339,8 @@ cdef class Table:
List of columns that should be returned as pandas.Categorical
integer_object_nulls : boolean, default False
Cast integers with nulls to objects
+ date_as_object : boolean, default False
+ Cast dates to objects
use_threads: boolean, default True
Whether to parallelize the conversion using multiple threads
@@ -1348,6 +1355,7 @@ cdef class Table:
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only,
integer_object_nulls=integer_object_nulls,
+ date_as_object=date_as_object,
use_threads=use_threads)
mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 3fa7cf4..bb53c14 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -906,6 +906,85 @@ class TestConvertDateTimeLikeTypes(object):
with pytest.raises(pa.ArrowInvalid, match=expected_msg):
pa.Array.from_pandas(s, type=pa.date64(), mask=mask)
+ def test_array_date_as_object(self):
+ data = [date(2000, 1, 1),
+ None,
+ date(1970, 1, 1),
+ date(2040, 2, 26)]
+ expected = np.array(['2000-01-01',
+ None,
+ '1970-01-01',
+ '2040-02-26'], dtype='datetime64')
+
+ arr = pa.array(data)
+ assert arr.equals(pa.array(expected))
+
+ result = arr.to_pandas()
+ assert result.dtype == expected.dtype
+ npt.assert_array_equal(arr.to_pandas(), expected)
+
+ result = arr.to_pandas(date_as_object=True)
+ expected = expected.astype(object)
+ assert result.dtype == expected.dtype
+ npt.assert_array_equal(result, expected)
+
+ def test_chunked_array_convert_date_as_object(self):
+ data = [date(2000, 1, 1),
+ None,
+ date(1970, 1, 1),
+ date(2040, 2, 26)]
+ expected = np.array(['2000-01-01',
+ None,
+ '1970-01-01',
+ '2040-02-26'], dtype='datetime64')
+ carr = pa.chunked_array([data])
+
+ result = carr.to_pandas()
+ assert result.dtype == expected.dtype
+ npt.assert_array_equal(carr.to_pandas(), expected)
+
+ result = carr.to_pandas(date_as_object=True)
+ expected = expected.astype(object)
+ assert result.dtype == expected.dtype
+ npt.assert_array_equal(result, expected)
+
+ def test_column_convert_date_as_object(self):
+ data = [date(2000, 1, 1),
+ None,
+ date(1970, 1, 1),
+ date(2040, 2, 26)]
+ expected = np.array(['2000-01-01',
+ None,
+ '1970-01-01',
+ '2040-02-26'], dtype='datetime64')
+
+ arr = pa.array(data)
+ column = pa.column('date', arr)
+
+ result = column.to_pandas()
+ npt.assert_array_equal(column.to_pandas(), expected)
+
+ result = column.to_pandas(date_as_object=True)
+ expected = expected.astype(object)
+ assert result.dtype == expected.dtype
+ npt.assert_array_equal(result, expected)
+
+ def test_table_convert_date_as_object(self):
+ df = pd.DataFrame({
+ 'date': [date(2000, 1, 1),
+ None,
+ date(1970, 1, 1),
+ date(2040, 2, 26)]})
+
+ table = pa.Table.from_pandas(df, preserve_index=False)
+
+ df_datetime = table.to_pandas()
+ df_object = table.to_pandas(date_as_object=True)
+
+ tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime,
+ check_dtype=True)
+ tm.assert_frame_equal(df, df_object, check_dtype=True)
+
def test_date_infer(self):
df = pd.DataFrame({
'date': [date(2000, 1, 1),