You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/12/19 23:44:15 UTC
arrow git commit: ARROW-420: Align DATE type with Java implementation
Repository: arrow
Updated Branches:
refs/heads/master cfde4607d -> d7845fcd8
ARROW-420: Align DATE type with Java implementation
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #238 from xhochy/ARROW-420 and squashes the following commits:
e497d9f [Uwe L. Korn] Add datetime.date parsing for numpy conversion
5c21453 [Uwe L. Korn] Add support for datetime.datetime
6bf346f [Uwe L. Korn] Add datetime.date conversions
6fca4da [Uwe L. Korn] ARROW-420: Align DATE type with Java implementation
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d7845fcd
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d7845fcd
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d7845fcd
Branch: refs/heads/master
Commit: d7845fcd8b8a06248e42ca083c6460c43723c154
Parents: cfde460
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Mon Dec 19 18:44:09 2016 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Dec 19 18:44:09 2016 -0500
----------------------------------------------------------------------
cpp/src/arrow/array.cc | 1 +
cpp/src/arrow/array.h | 1 +
cpp/src/arrow/builder.cc | 2 +
cpp/src/arrow/builder.h | 1 +
cpp/src/arrow/type.cc | 4 +
cpp/src/arrow/type.h | 4 +-
cpp/src/arrow/type_fwd.h | 4 +-
cpp/src/arrow/type_traits.h | 8 ++
python/pyarrow/__init__.py | 1 +
python/pyarrow/array.pyx | 7 +-
python/pyarrow/includes/libarrow.pxd | 16 ++++
python/pyarrow/scalar.pyx | 31 +++++++
python/pyarrow/schema.pyx | 7 ++
python/pyarrow/tests/test_convert_builtin.py | 28 ++++++
python/pyarrow/tests/test_convert_pandas.py | 15 ++++
python/src/pyarrow/adapters/builtin.cc | 69 +++++++++++++++
python/src/pyarrow/adapters/pandas.cc | 103 ++++++++++++++++++----
python/src/pyarrow/helpers.cc | 6 ++
python/src/pyarrow/helpers.h | 2 +
python/src/pyarrow/util/datetime.h | 40 +++++++++
20 files changed, 330 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 7ab61f5..d13fa1e 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -148,6 +148,7 @@ template class NumericArray<Int16Type>;
template class NumericArray<Int32Type>;
template class NumericArray<Int64Type>;
template class NumericArray<TimestampType>;
+template class NumericArray<DateType>;
template class NumericArray<HalfFloatType>;
template class NumericArray<FloatType>;
template class NumericArray<DoubleType>;
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 1a4a923..26d53f7 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -468,6 +468,7 @@ extern template class ARROW_EXPORT NumericArray<HalfFloatType>;
extern template class ARROW_EXPORT NumericArray<FloatType>;
extern template class ARROW_EXPORT NumericArray<DoubleType>;
extern template class ARROW_EXPORT NumericArray<TimestampType>;
+extern template class ARROW_EXPORT NumericArray<DateType>;
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 493b5e7..1d94dba 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -199,6 +199,7 @@ template class PrimitiveBuilder<Int8Type>;
template class PrimitiveBuilder<Int16Type>;
template class PrimitiveBuilder<Int32Type>;
template class PrimitiveBuilder<Int64Type>;
+template class PrimitiveBuilder<DateType>;
template class PrimitiveBuilder<TimestampType>;
template class PrimitiveBuilder<HalfFloatType>;
template class PrimitiveBuilder<FloatType>;
@@ -411,6 +412,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(INT32, Int32Builder);
BUILDER_CASE(UINT64, UInt64Builder);
BUILDER_CASE(INT64, Int64Builder);
+ BUILDER_CASE(DATE, DateBuilder);
BUILDER_CASE(TIMESTAMP, TimestampBuilder);
BUILDER_CASE(BOOL, BooleanBuilder);
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 7162d31..2051398 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -220,6 +220,7 @@ using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;
using TimestampBuilder = NumericBuilder<TimestampType>;
+using DateBuilder = NumericBuilder<DateType>;
using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
using FloatBuilder = NumericBuilder<FloatType>;
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 5b172e4..4748cc3 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -88,6 +88,10 @@ std::string StructType::ToString() const {
return s.str();
}
+std::string DateType::ToString() const {  // Human-readable name of the DATE type.
+ return std::string("date");  // Always the fixed literal "date".
+}
+
std::string UnionType::ToString() const {
std::stringstream s;
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 8637081..7300570 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -413,14 +413,14 @@ struct ARROW_EXPORT UnionType : public DataType {
struct ARROW_EXPORT DateType : public FixedWidthType {
static constexpr Type::type type_id = Type::DATE;
- using c_type = int32_t;
+ using c_type = int64_t;
DateType() : FixedWidthType(Type::DATE) {}
int bit_width() const override { return sizeof(c_type) * 8; }
Status Accept(TypeVisitor* visitor) const override;
- std::string ToString() const override { return name(); }
+ std::string ToString() const override;
static std::string name() { return "date"; }
};
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/type_fwd.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 6d660f4..a9db32d 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -87,13 +87,15 @@ _NUMERIC_TYPE_DECL(Double);
#undef _NUMERIC_TYPE_DECL
struct DateType;
-class DateArray;
+using DateArray = NumericArray<DateType>;
+using DateBuilder = NumericBuilder<DateType>;
struct TimeType;
class TimeArray;
struct TimestampType;
using TimestampArray = NumericArray<TimestampType>;
+using TimestampBuilder = NumericBuilder<TimestampType>;
struct IntervalType;
using IntervalArray = NumericArray<IntervalType>;
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/cpp/src/arrow/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index 3aaec0b..5616018 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -91,6 +91,14 @@ struct TypeTraits<Int64Type> {
};
template <>
+struct TypeTraits<DateType> {  // Compile-time traits for DateType.
+ using ArrayType = DateArray;
+ // using BuilderType = DateBuilder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }  // Byte size of `elements` values at one int64_t each.
+};
+
+template <>
struct TypeTraits<TimestampType> {
using ArrayType = TimestampArray;
// using BuilderType = TimestampBuilder;
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index b9d3861..a42e39c 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -37,6 +37,7 @@ from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
from pyarrow.schema import (null, bool_,
int8, int16, int32, int64,
uint8, uint16, uint32, uint64,
+ timestamp, date,
float_, double, string,
list_, struct, field,
DataType, Field, Schema, schema)
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index d44212f..84f1705 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -218,6 +218,10 @@ cdef class UInt64Array(NumericArray):
pass
+cdef class DateArray(NumericArray):  # Array class registered for Type_DATE in _array_classes; adds nothing beyond NumericArray.
+ pass
+
+
cdef class FloatArray(NumericArray):
pass
@@ -245,6 +249,7 @@ cdef dict _array_classes = {
Type_INT16: Int16Array,
Type_INT32: Int32Array,
Type_INT64: Int64Array,
+ Type_DATE: DateArray,
Type_FLOAT: FloatArray,
Type_DOUBLE: DoubleArray,
Type_LIST: ListArray,
@@ -284,7 +289,7 @@ def from_pylist(object list_obj, DataType type=None):
if type is None:
check_status(pyarrow.ConvertPySequence(list_obj, &sp_array))
else:
- raise NotImplementedError
+ raise NotImplementedError()
return box_arrow_array(sp_array)
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 15781ce..419dd74 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -39,11 +39,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
Type_DOUBLE" arrow::Type::DOUBLE"
Type_TIMESTAMP" arrow::Type::TIMESTAMP"
+ Type_DATE" arrow::Type::DATE"
Type_STRING" arrow::Type::STRING"
Type_LIST" arrow::Type::LIST"
Type_STRUCT" arrow::Type::STRUCT"
+ enum TimeUnit" arrow::TimeUnit":
+ TimeUnit_SECOND" arrow::TimeUnit::SECOND"
+ TimeUnit_MILLI" arrow::TimeUnit::MILLI"
+ TimeUnit_MICRO" arrow::TimeUnit::MICRO"
+ TimeUnit_NANO" arrow::TimeUnit::NANO"
+
cdef cppclass CDataType" arrow::DataType":
Type type
@@ -74,6 +81,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CStringType" arrow::StringType"(CDataType):
pass
+ cdef cppclass CTimestampType" arrow::TimestampType"(CDataType):
+ TimeUnit unit
+
cdef cppclass CField" arrow::Field":
c_string name
shared_ptr[CDataType] type
@@ -132,6 +142,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CInt64Array" arrow::Int64Array"(CArray):
int64_t Value(int i)
+ cdef cppclass CDateArray" arrow::DateArray"(CArray):
+ int64_t Value(int i)
+
+ cdef cppclass CTimestampArray" arrow::TimestampArray"(CArray):
+ int64_t Value(int i)
+
cdef cppclass CFloatArray" arrow::FloatArray"(CArray):
float Value(int i)
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index c2d20e4..09f60e2 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -20,6 +20,9 @@ from pyarrow.schema cimport DataType, box_data_type
from pyarrow.compat import frombytes
import pyarrow.schema as schema
+import datetime
+
+
NA = None
cdef class NAType(Scalar):
@@ -120,6 +123,32 @@ cdef class UInt64Value(ArrayValue):
return ap.Value(self.index)
+cdef class DateValue(ArrayValue):  # Scalar wrapper for one element of a DATE array.
+
+ def as_py(self):  # Returns the element as a datetime.date.
+ cdef CDateArray* ap = <CDateArray*> self.sp_array.get()
+ return datetime.date.fromtimestamp(ap.Value(self.index) / 1000)  # Stored value is divided by 1000 (ms -> s). NOTE(review): date.fromtimestamp() converts using the local time zone -- confirm this matches how the value was encoded.
+
+
+cdef class TimestampValue(ArrayValue):  # Scalar wrapper for one element of a TIMESTAMP array.
+
+ def as_py(self):  # Returns the element as a naive datetime.datetime, reading the stored value as UTC.
+ cdef:
+ CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
+ CTimestampType* dtype = <CTimestampType*>ap.type().get()
+ int64_t val = ap.Value(self.index)
+
+ if dtype.unit == TimeUnit_SECOND:  # Scale by the column's time unit so utcfromtimestamp() receives seconds.
+ return datetime.datetime.utcfromtimestamp(val)
+ elif dtype.unit == TimeUnit_MILLI:
+ return datetime.datetime.utcfromtimestamp(float(val) / 1000)
+ elif dtype.unit == TimeUnit_MICRO:
+ return datetime.datetime.utcfromtimestamp(float(val) / 1000000)  # NOTE(review): goes through a float; confirm microsecond precision is preserved for the value range in use.
+ else:
+ # TimeUnit_NANO
+ raise NotImplementedError("Cannot convert nanosecond timestamps to datetime.datetime")
+
+
cdef class FloatValue(ArrayValue):
def as_py(self):
@@ -184,6 +213,8 @@ cdef dict _scalar_classes = {
Type_INT16: Int16Value,
Type_INT32: Int32Value,
Type_INT64: Int64Value,
+ Type_DATE: DateValue,
+ Type_TIMESTAMP: TimestampValue,
Type_FLOAT: FloatValue,
Type_DOUBLE: DoubleValue,
Type_LIST: ListValue,
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index e0badb9..d05ac9e 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -164,6 +164,7 @@ cdef set PRIMITIVE_TYPES = set([
Type_UINT16, Type_INT16,
Type_UINT32, Type_INT32,
Type_UINT64, Type_INT64,
+ Type_TIMESTAMP, Type_DATE,
Type_FLOAT, Type_DOUBLE])
def null():
@@ -196,6 +197,12 @@ def uint64():
def int64():
return primitive_type(Type_INT64)
+def timestamp():  # Factory for the TIMESTAMP data type.
+ return primitive_type(Type_TIMESTAMP)  # NOTE(review): no unit argument is passed; presumably the C++ helper picks the unit -- confirm.
+
+def date():  # Factory for the DATE data type.
+ return primitive_type(Type_DATE)
+
def float_():
return primitive_type(Type_FLOAT)
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 34371b0..7dc1c1b 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -18,6 +18,7 @@
from pyarrow.compat import unittest
import pyarrow
+import datetime
class TestConvertList(unittest.TestCase):
@@ -78,6 +79,33 @@ class TestConvertList(unittest.TestCase):
assert arr.type == pyarrow.string()
assert arr.to_pylist() == ['foo', 'bar', None, 'arrow']
+ def test_date(self):  # datetime.date values (plus one None) become a DATE array and read back unchanged.
+ data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)]
+ arr = pyarrow.from_pylist(data)
+ assert len(arr) == 4
+ assert arr.type == pyarrow.date()  # Type is inferred from the values; none is passed in.
+ assert arr.null_count == 1  # The single None entry.
+ assert arr[0].as_py() == datetime.date(2000, 1, 1)
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.date(1970, 1, 1)
+ assert arr[3].as_py() == datetime.date(2040, 2, 26)
+
+ def test_timestamp(self):  # datetime.datetime values (plus one None) become a TIMESTAMP array and read back unchanged.
+ data = [
+ datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
+ None,
+ datetime.datetime(2006, 1, 13, 12, 34, 56, 432539),
+ datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
+ ]
+ arr = pyarrow.from_pylist(data)
+ assert len(arr) == 4
+ assert arr.type == pyarrow.timestamp()  # Type is inferred from the values; none is passed in.
+ assert arr.null_count == 1  # The single None entry.
+ assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)  # Microseconds must survive the round trip.
+ assert arr[1].as_py() is None
+ assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539)
+ assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699)
+
def test_mixed_nesting_levels(self):
pyarrow.from_pylist([1, 2, None])
pyarrow.from_pylist([[1], [2], None])
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index b527ca7..cf50f3d 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import datetime
import unittest
import numpy as np
@@ -204,6 +205,20 @@ class TestPandasConversion(unittest.TestCase):
})
self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+ def test_date(self):  # Round-trips an object column of datetime.date values (plus one None) through Arrow.
+ df = pd.DataFrame({
+ 'date': [
+ datetime.date(2000, 1, 1),
+ None,
+ datetime.date(1970, 1, 1),
+ datetime.date(2040, 2, 26)
+ ]})
+ table = A.from_pandas_dataframe(df)
+ result = table.to_pandas()
+ expected = df.copy()
+ expected['date'] = pd.to_datetime(df['date'])  # The result is compared against pd.to_datetime() output, not the original date objects.
+ tm.assert_frame_equal(result, expected)
+
# def test_category(self):
# repeats = 1000
# values = [b'foo', None, u'bar', 'qux', np.nan]
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index ac2f533..e0cb7c2 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -16,6 +16,7 @@
// under the License.
#include <Python.h>
+#include <datetime.h>
#include <sstream>
#include "pyarrow/adapters/builtin.h"
@@ -24,6 +25,7 @@
#include "arrow/status.h"
#include "pyarrow/helpers.h"
+#include "pyarrow/util/datetime.h"
using arrow::ArrayBuilder;
using arrow::DataType;
@@ -55,6 +57,8 @@ class ScalarVisitor {
none_count_(0),
bool_count_(0),
int_count_(0),
+ date_count_(0),
+ timestamp_count_(0),
float_count_(0),
string_count_(0) {}
@@ -68,6 +72,10 @@ class ScalarVisitor {
++float_count_;
} else if (IsPyInteger(obj)) {
++int_count_;
+ } else if (PyDate_CheckExact(obj)) {
+ ++date_count_;
+ } else if (PyDateTime_CheckExact(obj)) {
+ ++timestamp_count_;
} else if (IsPyBaseString(obj)) {
++string_count_;
} else {
@@ -82,6 +90,10 @@ class ScalarVisitor {
} else if (int_count_) {
// TODO(wesm): tighter type later
return INT64;
+ } else if (date_count_) {
+ return DATE;
+ } else if (timestamp_count_) {
+ return TIMESTAMP_US;
} else if (bool_count_) {
return BOOL;
} else if (string_count_) {
@@ -100,6 +112,8 @@ class ScalarVisitor {
int64_t none_count_;
int64_t bool_count_;
int64_t int_count_;
+ int64_t date_count_;
+ int64_t timestamp_count_;
int64_t float_count_;
int64_t string_count_;
@@ -297,6 +311,56 @@ class Int64Converter : public TypedConverter<arrow::Int64Builder> {
}
};
+class DateConverter : public TypedConverter<arrow::DateBuilder> {  // Converts a Python sequence of datetime.date / None into a DATE builder.
+ public:
+ Status AppendData(PyObject* seq) override {  // Appends every item of `seq`; None becomes a null slot.
+ Py_ssize_t size = PySequence_Size(seq);
+ RETURN_NOT_OK(typed_builder_->Reserve(size));
+ for (int64_t i = 0; i < size; ++i) {
+ OwnedRef item(PySequence_GetItem(seq, i));
+ if (item.obj() == Py_None) {
+ typed_builder_->AppendNull();
+ } else {
+ PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());  // No type check here: the item is assumed to be a date. NOTE(review): confirm callers guarantee this.
+ typed_builder_->Append(PyDate_to_ms(pydate));  // Stored as milliseconds (see PyDate_to_ms).
+ }
+ }
+ return Status::OK();
+ }
+};
+
+class TimestampConverter : public TypedConverter<arrow::TimestampBuilder> {
+ public:
+  // Appends every item of `seq` as microseconds since 1970-01-01T00:00:00; None becomes null.
+  // The datetime fields are used exactly as stored on the object; no time zone is applied.
+  Status AppendData(PyObject* seq) override {
+    Py_ssize_t size = PySequence_Size(seq);
+    RETURN_NOT_OK(typed_builder_->Reserve(size));
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));
+      if (item.obj() == Py_None) {
+        typed_builder_->AppendNull();
+      } else {
+        PyDateTime_DateTime* pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+        const int64_t m = PyDateTime_GET_MONTH(pydatetime);
+        const int64_t y = PyDateTime_GET_YEAR(pydatetime) - (m <= 2 ? 1 : 0);  // year from March 1st
+        const int64_t yoe = y % 400;  // year of era; y >= 0 since datetime years are >= 1
+        const int64_t doy = (153 * (m > 2 ? m - 3 : m + 9) + 2) / 5 + PyDateTime_GET_DAY(pydatetime) - 1;
+        // Days since the epoch from integer calendar math (719468 = days from 0000-03-01 to
+        // 1970-01-01). mktime()/difftime() made the value depend on the process-local time zone
+        // and multiplied a `long`, which overflows where long is 32 bits wide.
+        const int64_t days = (y / 400) * 146097 + yoe * 365 + yoe / 4 - yoe / 100 + doy - 719468;
+        const int64_t secs = days * 86400 + PyDateTime_DATE_GET_HOUR(pydatetime) * 3600 +
+                             PyDateTime_DATE_GET_MINUTE(pydatetime) * 60 +
+                             PyDateTime_DATE_GET_SECOND(pydatetime);
+        RETURN_IF_PYERROR();
+        typed_builder_->Append(secs * 1000000 + PyDateTime_DATE_GET_MICROSECOND(pydatetime));
+      }
+    }
+    return Status::OK();
+  }
+};
+
class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
public:
Status AppendData(PyObject* seq) override {
@@ -379,6 +443,10 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
return std::make_shared<BoolConverter>();
case Type::INT64:
return std::make_shared<Int64Converter>();
+ case Type::DATE:
+ return std::make_shared<DateConverter>();
+ case Type::TIMESTAMP:
+ return std::make_shared<TimestampConverter>();
case Type::DOUBLE:
return std::make_shared<DoubleConverter>();
case Type::STRING:
@@ -409,6 +477,7 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
std::shared_ptr<DataType> type;
int64_t size;
+ PyDateTime_IMPORT;
RETURN_NOT_OK(InferArrowType(obj, &size, &type));
// Handle NA / NullType case
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index 64b7086..f8dff6d 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -35,6 +35,7 @@
#include "pyarrow/common.h"
#include "pyarrow/config.h"
+#include "pyarrow/util/datetime.h"
namespace pyarrow {
@@ -167,6 +168,28 @@ class ArrowSerializer {
private:
Status ConvertData();
+  // Builds a DATE array from the object ndarray `arr_`: exact datetime.date instances are stored
+  // as milliseconds via PyDate_to_ms(); every other object becomes a null slot.
+  Status ConvertDates(std::shared_ptr<Array>* out) {
+    PyAcquireGIL lock;  // the PyDate_* macros below inspect Python objects
+
+    PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+    arrow::TypePtr date_type(new arrow::DateType());
+    arrow::DateBuilder date_builder(pool_, date_type);
+    RETURN_NOT_OK(date_builder.Resize(length_));
+
+    for (int64_t i = 0; i < length_; ++i) {
+      PyObject* obj = objects[i];
+      if (PyDate_CheckExact(obj)) {
+        // Propagate builder failures instead of silently dropping the returned Status.
+        RETURN_NOT_OK(date_builder.Append(PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(obj))));
+      } else {
+        RETURN_NOT_OK(date_builder.AppendNull());
+      }
+    }
+    return date_builder.Finish(out);
+  }
+
Status ConvertObjectStrings(std::shared_ptr<Array>* out) {
PyAcquireGIL lock;
@@ -369,6 +392,10 @@ inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out)
// TODO: mask not supported here
const PyObject** objects = reinterpret_cast<const PyObject**>(PyArray_DATA(arr_));
+ {
+ PyAcquireGIL lock;
+ PyDateTime_IMPORT;
+ }
for (int64_t i = 0; i < length_; ++i) {
if (PyObject_is_null(objects[i])) {
@@ -377,6 +404,8 @@ inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out)
return ConvertObjectStrings(out);
} else if (PyBool_Check(objects[i])) {
return ConvertBooleans(out);
+ } else if (PyDate_CheckExact(objects[i])) {
+ return ConvertDates(out);
} else {
return Status::TypeError("unhandled python type");
}
@@ -548,6 +577,17 @@ struct arrow_traits<arrow::Type::TIMESTAMP> {
};
template <>
+struct arrow_traits<arrow::Type::DATE> {  // Pandas/NumPy conversion traits for DATE columns.
+ static constexpr int npy_type = NPY_DATETIME;  // DATE is exposed to NumPy as datetime64.
+ static constexpr bool supports_nulls = true;
+ static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();  // Sentinel written for null slots. NOTE(review): presumably NumPy's NaT bit pattern -- confirm.
+ static constexpr bool is_boolean = false;
+ static constexpr bool is_pandas_numeric_not_nullable = false;
+ static constexpr bool is_pandas_numeric_nullable = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
struct arrow_traits<arrow::Type::STRING> {
static constexpr int npy_type = NPY_OBJECT;
static constexpr bool supports_nulls = true;
@@ -567,24 +607,28 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
if (type == NPY_DATETIME) {
- auto timestamp_type = static_cast<arrow::TimestampType*>(datatype);
- // We only support ms resolution at the moment
PyArray_Descr* descr = PyArray_DESCR(out);
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+ if (datatype->type == arrow::Type::TIMESTAMP) {
+ auto timestamp_type = static_cast<arrow::TimestampType*>(datatype);
- switch (timestamp_type->unit) {
- case arrow::TimestampType::Unit::SECOND:
- date_dtype->meta.base = NPY_FR_s;
- break;
- case arrow::TimestampType::Unit::MILLI:
- date_dtype->meta.base = NPY_FR_ms;
- break;
- case arrow::TimestampType::Unit::MICRO:
- date_dtype->meta.base = NPY_FR_us;
- break;
- case arrow::TimestampType::Unit::NANO:
- date_dtype->meta.base = NPY_FR_ns;
- break;
+ switch (timestamp_type->unit) {
+ case arrow::TimestampType::Unit::SECOND:
+ date_dtype->meta.base = NPY_FR_s;
+ break;
+ case arrow::TimestampType::Unit::MILLI:
+ date_dtype->meta.base = NPY_FR_ms;
+ break;
+ case arrow::TimestampType::Unit::MICRO:
+ date_dtype->meta.base = NPY_FR_us;
+ break;
+ case arrow::TimestampType::Unit::NANO:
+ date_dtype->meta.base = NPY_FR_ns;
+ break;
+ }
+ } else {
+ // datatype->type == arrow::Type::DATE
+ date_dtype->meta.base = NPY_FR_D;
}
}
}
@@ -666,7 +710,7 @@ class ArrowDeserializer {
template <int T2>
inline typename std::enable_if<
- arrow_traits<T2>::is_pandas_numeric_nullable, Status>::type
+ (T2 != arrow::Type::DATE) & arrow_traits<T2>::is_pandas_numeric_nullable, Status>::type
ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
typedef typename arrow_traits<T2>::T T;
size_t chunk_offset = 0;
@@ -697,6 +741,32 @@ class ArrowDeserializer {
return Status::OK();
}
+  // DATE overload: writes each chunk into `out_` as whole days since the epoch (set_numpy_metadata
+  // labels DATE columns with NPY_FR_D); null slots receive arrow_traits<T2>::na_value.
+  template <int T2>
+  inline typename std::enable_if<
+    T2 == arrow::Type::DATE, Status>::type
+  ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) {
+    typedef typename arrow_traits<T2>::T T;
+    const T kMillisPerDay = 86400000;  // 1000 * 60 * 60 * 24
+    size_t chunk_offset = 0;
+    RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+    for (int c = 0; c < data->num_chunks(); c++) {
+      const std::shared_ptr<Array> arr = data->chunk(c);
+      auto prim_arr = static_cast<arrow::PrimitiveArray*>(arr.get());
+      auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+      auto out_values = reinterpret_cast<T*>(PyArray_DATA(out_)) + chunk_offset;
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        // Floor instead of truncating toward zero, so a pre-1970 value that is not an exact
+        // multiple of a day still lands on its own calendar day.
+        out_values[i] = arr->IsNull(i) ? arrow_traits<T2>::na_value
+            : in_values[i] / kMillisPerDay - (in_values[i] % kMillisPerDay < 0 ? 1 : 0);
+      }
+      chunk_offset += arr->length();
+    }
+    return Status::OK();
+  }
+
// Integer specialization
template <int T2>
inline typename std::enable_if<
@@ -879,6 +949,7 @@ Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_re
FROM_ARROW_CASE(FLOAT);
FROM_ARROW_CASE(DOUBLE);
FROM_ARROW_CASE(STRING);
+ FROM_ARROW_CASE(DATE);
FROM_ARROW_CASE(TIMESTAMP);
default:
return Status::NotImplemented("Arrow type reading not implemented");
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index 08003aa..af92744 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -33,6 +33,8 @@ const std::shared_ptr<Int8Type> INT8 = std::make_shared<Int8Type>();
const std::shared_ptr<Int16Type> INT16 = std::make_shared<Int16Type>();
const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>();
const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>();
+const std::shared_ptr<DateType> DATE = std::make_shared<DateType>();
+const std::shared_ptr<TimestampType> TIMESTAMP_US = std::make_shared<TimestampType>(TimeUnit::MICRO);
const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>();
const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>();
const std::shared_ptr<StringType> STRING = std::make_shared<StringType>();
@@ -54,6 +56,10 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(INT32, Int32Type);
GET_PRIMITIVE_TYPE(UINT64, UInt64Type);
GET_PRIMITIVE_TYPE(INT64, Int64Type);
+ GET_PRIMITIVE_TYPE(DATE, DateType);
+ case Type::TIMESTAMP:
+ return TIMESTAMP_US;
+ break;
GET_PRIMITIVE_TYPE(BOOL, BooleanType);
GET_PRIMITIVE_TYPE(FLOAT, FloatType);
GET_PRIMITIVE_TYPE(DOUBLE, DoubleType);
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/src/pyarrow/helpers.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h
index fa9c713..e714bba 100644
--- a/python/src/pyarrow/helpers.h
+++ b/python/src/pyarrow/helpers.h
@@ -38,6 +38,8 @@ extern const std::shared_ptr<arrow::Int8Type> INT8;
extern const std::shared_ptr<arrow::Int16Type> INT16;
extern const std::shared_ptr<arrow::Int32Type> INT32;
extern const std::shared_ptr<arrow::Int64Type> INT64;
+extern const std::shared_ptr<arrow::DateType> DATE;
+extern const std::shared_ptr<arrow::TimestampType> TIMESTAMP_US;
extern const std::shared_ptr<arrow::FloatType> FLOAT;
extern const std::shared_ptr<arrow::DoubleType> DOUBLE;
extern const std::shared_ptr<arrow::StringType> STRING;
http://git-wip-us.apache.org/repos/asf/arrow/blob/d7845fcd/python/src/pyarrow/util/datetime.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/util/datetime.h b/python/src/pyarrow/util/datetime.h
new file mode 100644
index 0000000..b67accc
--- /dev/null
+++ b/python/src/pyarrow/util/datetime.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_UTIL_DATETIME_H
+#define PYARROW_UTIL_DATETIME_H
+
+#include <Python.h>
+#include <datetime.h>
+
+namespace pyarrow {
+
+inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {  // ms from 1970-01-01 to the given date
+  // Proleptic-Gregorian day count in pure integer math. mktime() is avoided because it reads
+  // the fields in the process-local time zone, so the result depended on that zone's rules.
+  const int64_t m = PyDateTime_GET_MONTH(pydate);
+  const int64_t y = PyDateTime_GET_YEAR(pydate) - (m <= 2 ? 1 : 0);  // year starting March 1st
+  const int64_t era = y / 400;  // y >= 0 because datetime.date years are >= 1
+  const int64_t yoe = y - era * 400;  // year of era, [0, 399]
+  const int64_t doy = (153 * (m > 2 ? m - 3 : m + 9) + 2) / 5 + PyDateTime_GET_DAY(pydate) - 1;
+  const int64_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;  // day of era, [0, 146096]
+  return (era * 146097 + doe - 719468) * 86400000LL;  // 719468 days from 0000-03-01 to epoch
+}
+
+} // namespace pyarrow
+
+#endif // PYARROW_UTIL_DATETIME_H