You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/08/28 19:25:46 UTC
arrow git commit: ARROW-242: Support Timestamp Data Type
Repository: arrow
Updated Branches:
refs/heads/master e081a4c27 -> 0a411fd29
ARROW-242: Support Timestamp Data Type
For the Pandas<->Parquet bridge, this is a lossy conversion (timestamps are reduced to millisecond resolution), so it must be explicitly activated by the user via `timestamps_to_ms`.
Regarding Parquet 1.0: yes, the logical type is not supported there, but it should simply be ignored by the reader. Implementing INT96 timestamps is outside the scope of this PR.
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #107 from xhochy/arrow-242 and squashes the following commits:
8db6968 [Uwe L. Korn] Add missing include
34126b1 [Uwe L. Korn] ARROW-242: Support Timestamp Data Type
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/0a411fd2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/0a411fd2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/0a411fd2
Branch: refs/heads/master
Commit: 0a411fd29ed1baac6f1524be82fc15e08f2b28db
Parents: e081a4c
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Sun Aug 28 15:25:35 2016 -0400
Committer: Wes McKinney <we...@apache.org>
Committed: Sun Aug 28 15:25:35 2016 -0400
----------------------------------------------------------------------
.../arrow/parquet/parquet-reader-writer-test.cc | 12 ++-
cpp/src/arrow/parquet/parquet-schema-test.cc | 23 +++-
cpp/src/arrow/parquet/reader.cc | 1 +
cpp/src/arrow/parquet/schema.cc | 13 ++-
cpp/src/arrow/parquet/writer.cc | 1 +
cpp/src/arrow/types/construct.cc | 3 +-
cpp/src/arrow/types/datetime.h | 12 ++-
cpp/src/arrow/types/primitive.cc | 1 +
cpp/src/arrow/types/primitive.h | 11 ++
python/pyarrow/array.pyx | 40 ++++++-
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/tests/test_convert_pandas.py | 24 ++++-
python/pyarrow/tests/test_parquet.py | 4 +-
python/src/pyarrow/adapters/pandas.cc | 107 +++++++++++++++++--
14 files changed, 232 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
index bfc27d2..d7b39dd 100644
--- a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
+++ b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc
@@ -138,6 +138,15 @@ struct test_traits<Int64Type> {
const int64_t test_traits<Int64Type>::value(-1024);
template <>
+struct test_traits<TimestampType> {
+ static constexpr ParquetType::type parquet_enum = ParquetType::INT64;
+ static constexpr LogicalType::type logical_enum = LogicalType::TIMESTAMP_MILLIS;
+ static int64_t const value;
+};
+
+const int64_t test_traits<TimestampType>::value(14695634030000);
+
+template <>
struct test_traits<FloatType> {
static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT;
static constexpr LogicalType::type logical_enum = LogicalType::NONE;
@@ -248,7 +257,8 @@ class TestParquetIO : public ::testing::Test {
// Parquet version 1.0.
typedef ::testing::Types<BooleanType, UInt8Type, Int8Type, UInt16Type, Int16Type,
- Int32Type, UInt64Type, Int64Type, FloatType, DoubleType, StringType> TestTypes;
+ Int32Type, UInt64Type, Int64Type, TimestampType, FloatType, DoubleType,
+ StringType> TestTypes;
TYPED_TEST_CASE(TestParquetIO, TestTypes);
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/parquet-schema-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc
index 819cdd3..a2bcd3e 100644
--- a/cpp/src/arrow/parquet/parquet-schema-test.cc
+++ b/cpp/src/arrow/parquet/parquet-schema-test.cc
@@ -22,6 +22,7 @@
#include "arrow/test-util.h"
#include "arrow/type.h"
+#include "arrow/types/datetime.h"
#include "arrow/types/decimal.h"
#include "arrow/util/status.h"
@@ -45,6 +46,9 @@ const auto INT64 = std::make_shared<Int64Type>();
const auto FLOAT = std::make_shared<FloatType>();
const auto DOUBLE = std::make_shared<DoubleType>();
const auto UTF8 = std::make_shared<StringType>();
+const auto TIMESTAMP_MS = std::make_shared<TimestampType>(TimestampType::Unit::MILLI);
+// TODO: This requires parquet-cpp implementing the MICROS enum value
+// const auto TIMESTAMP_US = std::make_shared<TimestampType>(TimestampType::Unit::MICRO);
const auto BINARY = std::make_shared<ListType>(std::make_shared<Field>("", UINT8));
const auto DECIMAL_8_4 = std::make_shared<DecimalType>(8, 4);
@@ -89,6 +93,14 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) {
PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false));
+ parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+ ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS));
+ arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_MS, false));
+
+ // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+ // ParquetType::INT64, LogicalType::TIMESTAMP_MICROS));
+ // arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_US, false));
+
parquet_fields.push_back(
PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
arrow_fields.push_back(std::make_shared<Field>("float", FLOAT));
@@ -153,9 +165,6 @@ TEST_F(TestConvertParquetSchema, UnsupportedThings) {
unsupported_nodes.push_back(PrimitiveNode::Make(
"int32", Repetition::OPTIONAL, ParquetType::INT32, LogicalType::DATE));
- unsupported_nodes.push_back(PrimitiveNode::Make(
- "int64", Repetition::OPTIONAL, ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS));
-
for (const NodePtr& node : unsupported_nodes) {
ASSERT_RAISES(NotImplemented, ConvertSchema({node}));
}
@@ -209,6 +218,14 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) {
PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64));
arrow_fields.push_back(std::make_shared<Field>("int64", INT64, false));
+ parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+ ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS));
+ arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_MS, false));
+
+ // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED,
+ // ParquetType::INT64, LogicalType::TIMESTAMP_MICROS));
+ // arrow_fields.push_back(std::make_shared<Field>("timestamp", TIMESTAMP_US, false));
+
parquet_fields.push_back(
PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT));
arrow_fields.push_back(std::make_shared<Field>("float", FLOAT));
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc
index e92967e..9f62125 100644
--- a/cpp/src/arrow/parquet/reader.cc
+++ b/cpp/src/arrow/parquet/reader.cc
@@ -368,6 +368,7 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>*
TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
TYPED_BATCH_CASE(STRING, StringType, ::parquet::ByteArrayType)
+ TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type)
default:
return Status::NotImplemented(field_->type->ToString());
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/schema.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc
index a79342a..cd91df3 100644
--- a/cpp/src/arrow/parquet/schema.cc
+++ b/cpp/src/arrow/parquet/schema.cc
@@ -52,6 +52,7 @@ const auto INT64 = std::make_shared<Int64Type>();
const auto FLOAT = std::make_shared<FloatType>();
const auto DOUBLE = std::make_shared<DoubleType>();
const auto UTF8 = std::make_shared<StringType>();
+const auto TIMESTAMP_MS = std::make_shared<TimestampType>(TimestampType::Unit::MILLI);
const auto BINARY = std::make_shared<ListType>(std::make_shared<Field>("", UINT8));
TypePtr MakeDecimalType(const PrimitiveNode* node) {
@@ -133,6 +134,9 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) {
case LogicalType::DECIMAL:
*out = MakeDecimalType(node);
break;
+ case LogicalType::TIMESTAMP_MILLIS:
+ *out = TIMESTAMP_MS;
+ break;
default:
return Status::NotImplemented("Unhandled logical type for int64");
break;
@@ -289,10 +293,15 @@ Status FieldToNode(const std::shared_ptr<Field>& field,
type = ParquetType::INT32;
logical_type = LogicalType::DATE;
break;
- case Type::TIMESTAMP:
+ case Type::TIMESTAMP: {
+ auto timestamp_type = static_cast<TimestampType*>(field->type.get());
+ if (timestamp_type->unit != TimestampType::Unit::MILLI) {
+ return Status::NotImplemented(
+ "Other timestamp units than millisecond are not yet support with parquet.");
+ }
type = ParquetType::INT64;
logical_type = LogicalType::TIMESTAMP_MILLIS;
- break;
+ } break;
case Type::TIMESTAMP_DOUBLE:
type = ParquetType::INT64;
// This is specified as seconds since the UNIX epoch
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/parquet/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc
index f9514aa..ddee573 100644
--- a/cpp/src/arrow/parquet/writer.cc
+++ b/cpp/src/arrow/parquet/writer.cc
@@ -240,6 +240,7 @@ Status FileWriter::Impl::WriteFlatColumnChunk(
TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type)
TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type)
TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type)
+ TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type)
TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
default:
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/construct.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc
index 5ae9c5a..0b71ea9 100644
--- a/cpp/src/arrow/types/construct.cc
+++ b/cpp/src/arrow/types/construct.cc
@@ -51,6 +51,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(INT32, Int32Builder);
BUILDER_CASE(UINT64, UInt64Builder);
BUILDER_CASE(INT64, Int64Builder);
+ BUILDER_CASE(TIMESTAMP, TimestampBuilder);
BUILDER_CASE(BOOL, BooleanBuilder);
@@ -105,7 +106,7 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array);
MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array);
MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
- MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, Int64Array);
+ MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray);
MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray);
MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray);
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h
index b782455..241a126 100644
--- a/cpp/src/arrow/types/datetime.h
+++ b/cpp/src/arrow/types/datetime.h
@@ -18,6 +18,8 @@
#ifndef ARROW_TYPES_DATETIME_H
#define ARROW_TYPES_DATETIME_H
+#include <string>
+
#include "arrow/type.h"
namespace arrow {
@@ -34,15 +36,23 @@ struct DateType : public DataType {
static char const* name() { return "date"; }
};
-struct TimestampType : public DataType {
+struct ARROW_EXPORT TimestampType : public DataType {
enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
+ typedef int64_t c_type;
+ static constexpr Type::type type_enum = Type::TIMESTAMP;
+
+ int value_size() const override { return sizeof(int64_t); }
+
Unit unit;
explicit TimestampType(Unit unit = Unit::MILLI)
: DataType(Type::TIMESTAMP), unit(unit) {}
TimestampType(const TimestampType& other) : TimestampType(other.unit) {}
+ virtual ~TimestampType() {}
+
+ std::string ToString() const override { return "timestamp"; }
static char const* name() { return "timestamp"; }
};
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/primitive.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index f4b47f9..375e94f 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -158,6 +158,7 @@ template class PrimitiveBuilder<Int8Type>;
template class PrimitiveBuilder<Int16Type>;
template class PrimitiveBuilder<Int32Type>;
template class PrimitiveBuilder<Int64Type>;
+template class PrimitiveBuilder<TimestampType>;
template class PrimitiveBuilder<FloatType>;
template class PrimitiveBuilder<DoubleType>;
template class PrimitiveBuilder<BooleanType>;
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/cpp/src/arrow/types/primitive.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index 770de76..c643783 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -26,6 +26,7 @@
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/type.h"
+#include "arrow/types/datetime.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
#include "arrow/util/status.h"
@@ -100,6 +101,7 @@ NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type);
NUMERIC_ARRAY_DECL(Int32Array, Int32Type);
NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type);
NUMERIC_ARRAY_DECL(Int64Array, Int64Type);
+NUMERIC_ARRAY_DECL(TimestampArray, TimestampType);
NUMERIC_ARRAY_DECL(FloatArray, FloatType);
NUMERIC_ARRAY_DECL(DoubleArray, DoubleType);
@@ -235,7 +237,15 @@ struct type_traits<Int64Type> {
static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
};
+
+template <>
+struct type_traits<TimestampType> {
+ typedef TimestampArray ArrayType;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+};
template <>
+
struct type_traits<FloatType> {
typedef FloatArray ArrayType;
@@ -260,6 +270,7 @@ typedef NumericBuilder<Int8Type> Int8Builder;
typedef NumericBuilder<Int16Type> Int16Builder;
typedef NumericBuilder<Int32Type> Int32Builder;
typedef NumericBuilder<Int64Type> Int64Builder;
+typedef NumericBuilder<TimestampType> TimestampBuilder;
typedef NumericBuilder<FloatType> FloatBuilder;
typedef NumericBuilder<DoubleType> DoubleBuilder;
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 619e5ef..5229b42 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -19,6 +19,8 @@
# distutils: language = c++
# cython: embedsignature = True
+import numpy as np
+
from pyarrow.includes.libarrow cimport *
cimport pyarrow.includes.pyarrow as pyarrow
@@ -186,6 +188,7 @@ cdef dict _array_classes = {
Type_DOUBLE: DoubleArray,
Type_LIST: ListArray,
Type_STRING: StringArray,
+ Type_TIMESTAMP: Int64Array,
}
cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
@@ -217,11 +220,28 @@ def from_pylist(object list_obj, DataType type=None):
return box_arrow_array(sp_array)
-def from_pandas_series(object series, object mask=None):
+def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
+ """
+ Convert pandas.Series to an Arrow Array.
+
+ Parameters
+ ----------
+ series: pandas.Series or numpy.ndarray
+
+ mask: pandas.Series or numpy.ndarray
+ array to mask null entries in the series
+
+ timestamps_to_ms: bool
+ Convert datetime columns to ms resolution. This is needed for
+ compability with other functionality like Parquet I/O which
+ only supports milliseconds.
+ """
cdef:
shared_ptr[CArray] out
series_values = series_as_ndarray(series)
+ if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
+ series_values = series_values.astype('datetime64[ms]')
if mask is None:
check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(),
@@ -234,14 +254,28 @@ def from_pandas_series(object series, object mask=None):
return box_arrow_array(out)
-def from_pandas_dataframe(object df, name=None):
+def from_pandas_dataframe(object df, name=None, timestamps_to_ms=False):
+ """
+ Convert pandas.DataFrame to an Arrow Table
+
+ Parameters
+ ----------
+ df: pandas.DataFrame
+
+ name: str
+
+ timestamps_to_ms: bool
+ Convert datetime columns to ms resolution. This is needed for
+ compability with other functionality like Parquet I/O which
+ only supports milliseconds.
+ """
cdef:
list names = []
list arrays = []
for name in df.columns:
col = df[name]
- arr = from_pandas_series(col)
+ arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms)
names.append(name)
arrays.append(arr)
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 91ce069..854d07d 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -38,6 +38,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
Type_FLOAT" arrow::Type::FLOAT"
Type_DOUBLE" arrow::Type::DOUBLE"
+ Type_TIMESTAMP" arrow::Type::TIMESTAMP"
Type_STRING" arrow::Type::STRING"
Type_LIST" arrow::Type::LIST"
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 6dc9c68..5530299 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -33,8 +33,9 @@ class TestPandasConversion(unittest.TestCase):
def tearDown(self):
pass
- def _check_pandas_roundtrip(self, df, expected=None):
- table = A.from_pandas_dataframe(df)
+ def _check_pandas_roundtrip(self, df, expected=None,
+ timestamps_to_ms=False):
+ table = A.from_pandas_dataframe(df, timestamps_to_ms=timestamps_to_ms)
result = table.to_pandas()
if expected is None:
expected = df
@@ -164,6 +165,25 @@ class TestPandasConversion(unittest.TestCase):
expected = pd.DataFrame({'strings': values * repeats})
self._check_pandas_roundtrip(df, expected)
+ def test_timestamps_notimezone(self):
+ df = pd.DataFrame({
+ 'datetime64': np.array([
+ '2007-07-13T01:23:34.123',
+ '2006-01-13T12:34:56.432',
+ '2010-08-13T05:46:57.437'],
+ dtype='datetime64[ms]')
+ })
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True)
+
+ df = pd.DataFrame({
+ 'datetime64': np.array([
+ '2007-07-13T01:23:34.123456789',
+ '2006-01-13T12:34:56.432539784',
+ '2010-08-13T05:46:57.437699912'],
+ dtype='datetime64[ns]')
+ })
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+
# def test_category(self):
# repeats = 1000
# values = [b'foo', None, u'bar', 'qux', np.nan]
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index de9cfbb..d89d947 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -57,11 +57,13 @@ def test_pandas_parquet_2_0_rountrip(tmpdir):
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
+ # Pandas only support ns resolution, Arrow at the moment only ms
+ 'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'),
'str': [str(x) for x in range(size)],
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None]
})
filename = tmpdir.join('pandas_rountrip.parquet')
- arrow_table = A.from_pandas_dataframe(df)
+ arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
table_read = pyarrow.parquet.read_table(filename.strpath)
df_read = table_read.to_pandas()
http://git-wip-us.apache.org/repos/asf/arrow/blob/0a411fd2/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index 8dcc2b1..a4e7fb6 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -38,6 +38,7 @@ namespace pyarrow {
using arrow::Array;
using arrow::Column;
+using arrow::DataType;
namespace util = arrow::util;
// ----------------------------------------------------------------------
@@ -50,7 +51,7 @@ struct npy_traits {
template <>
struct npy_traits<NPY_BOOL> {
typedef uint8_t value_type;
- using ArrayType = arrow::BooleanArray;
+ using TypeClass = arrow::BooleanType;
static constexpr bool supports_nulls = false;
static inline bool isnull(uint8_t v) {
@@ -62,7 +63,7 @@ struct npy_traits<NPY_BOOL> {
template <> \
struct npy_traits<NPY_##TYPE> { \
typedef T value_type; \
- using ArrayType = arrow::CapType##Array; \
+ using TypeClass = arrow::CapType##Type; \
\
static constexpr bool supports_nulls = false; \
static inline bool isnull(T v) { \
@@ -82,7 +83,7 @@ NPY_INT_DECL(UINT64, UInt64, uint64_t);
template <>
struct npy_traits<NPY_FLOAT32> {
typedef float value_type;
- using ArrayType = arrow::FloatArray;
+ using TypeClass = arrow::FloatType;
static constexpr bool supports_nulls = true;
@@ -94,7 +95,7 @@ struct npy_traits<NPY_FLOAT32> {
template <>
struct npy_traits<NPY_FLOAT64> {
typedef double value_type;
- using ArrayType = arrow::DoubleArray;
+ using TypeClass = arrow::DoubleType;
static constexpr bool supports_nulls = true;
@@ -104,6 +105,22 @@ struct npy_traits<NPY_FLOAT64> {
};
template <>
+struct npy_traits<NPY_DATETIME> {
+ typedef double value_type;
+ using TypeClass = arrow::TimestampType;
+
+ static constexpr bool supports_nulls = true;
+
+ static inline bool isnull(int64_t v) {
+ // NaT = -2**63
+ // = -0x8000000000000000
+ // = -9223372036854775808;
+ // = std::numeric_limits<int64_t>::min()
+ return v == std::numeric_limits<int64_t>::min();
+ }
+};
+
+template <>
struct npy_traits<NPY_OBJECT> {
typedef PyObject* value_type;
static constexpr bool supports_nulls = true;
@@ -206,6 +223,8 @@ class ArrowSerializer {
return Status::OK();
}
+ Status MakeDataType(std::shared_ptr<DataType>* out);
+
arrow::MemoryPool* pool_;
PyArrayObject* arr_;
@@ -254,6 +273,39 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap)
}
template <int TYPE>
+inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out) {
+ out->reset(new typename npy_traits<TYPE>::TypeClass());
+ return Status::OK();
+}
+
+template <>
+inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(std::shared_ptr<DataType>* out) {
+ PyArray_Descr* descr = PyArray_DESCR(arr_);
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+ arrow::TimestampType::Unit unit;
+
+ switch (date_dtype->meta.base) {
+ case NPY_FR_s:
+ unit = arrow::TimestampType::Unit::SECOND;
+ break;
+ case NPY_FR_ms:
+ unit = arrow::TimestampType::Unit::MILLI;
+ break;
+ case NPY_FR_us:
+ unit = arrow::TimestampType::Unit::MICRO;
+ break;
+ case NPY_FR_ns:
+ unit = arrow::TimestampType::Unit::NANO;
+ break;
+ default:
+ return Status::ValueError("Unknown NumPy datetime unit");
+ }
+
+ out->reset(new arrow::TimestampType(unit));
+ return Status::OK();
+}
+
+template <int TYPE>
inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
typedef npy_traits<TYPE> traits;
@@ -269,9 +321,9 @@ inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
}
RETURN_NOT_OK(ConvertData());
- *out = std::make_shared<typename traits::ArrayType>(length_, data_, null_count,
- null_bitmap_);
-
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(MakeDataType(&type));
+ RETURN_ARROW_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out));
return Status::OK();
}
@@ -402,6 +454,7 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
TO_ARROW_CASE(UINT64);
TO_ARROW_CASE(FLOAT32);
TO_ARROW_CASE(FLOAT64);
+ TO_ARROW_CASE(DATETIME);
TO_ARROW_CASE(OBJECT);
default:
std::stringstream ss;
@@ -477,6 +530,17 @@ struct arrow_traits<arrow::Type::DOUBLE> {
};
template <>
+struct arrow_traits<arrow::Type::TIMESTAMP> {
+ static constexpr int npy_type = NPY_DATETIME;
+ static constexpr bool supports_nulls = true;
+ static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+ static constexpr bool is_boolean = false;
+ static constexpr bool is_integer = true;
+ static constexpr bool is_floating = false;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
struct arrow_traits<arrow::Type::STRING> {
static constexpr int npy_type = NPY_OBJECT;
static constexpr bool supports_nulls = true;
@@ -494,6 +558,30 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
#endif
}
+inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
+ if (type == NPY_DATETIME) {
+ auto timestamp_type = static_cast<arrow::TimestampType*>(datatype);
+ // We only support ms resolution at the moment
+ PyArray_Descr* descr = PyArray_DESCR(out);
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+
+ switch (timestamp_type->unit) {
+ case arrow::TimestampType::Unit::SECOND:
+ date_dtype->meta.base = NPY_FR_s;
+ break;
+ case arrow::TimestampType::Unit::MILLI:
+ date_dtype->meta.base = NPY_FR_ms;
+ break;
+ case arrow::TimestampType::Unit::MICRO:
+ date_dtype->meta.base = NPY_FR_us;
+ break;
+ case arrow::TimestampType::Unit::NANO:
+ date_dtype->meta.base = NPY_FR_ns;
+ break;
+ }
+ }
+}
+
template <int TYPE>
class ArrowDeserializer {
public:
@@ -522,6 +610,8 @@ class ArrowDeserializer {
return Status::OK();
}
+ set_numpy_metadata(type, col_->type().get(), out_);
+
return Status::OK();
}
@@ -538,6 +628,8 @@ class ArrowDeserializer {
return Status::OK();
}
+ set_numpy_metadata(type, col_->type().get(), out_);
+
if (PyArray_SetBaseObject(out_, py_ref_) == -1) {
// Error occurred, trust that SetBaseObject set the error state
return Status::OK();
@@ -713,6 +805,7 @@ Status ArrowToPandas(const std::shared_ptr<Column>& col, PyObject* py_ref,
FROM_ARROW_CASE(FLOAT);
FROM_ARROW_CASE(DOUBLE);
FROM_ARROW_CASE(STRING);
+ FROM_ARROW_CASE(TIMESTAMP);
default:
return Status::NotImplemented("Arrow type reading not implemented");
}