You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/10 00:00:37 UTC
arrow git commit: ARROW-722: [Python] Support additional date/time
types and metadata, conversion to/from NumPy and pandas.DataFrame
Repository: arrow
Updated Branches:
refs/heads/master 754bcce68 -> 137aade40
ARROW-722: [Python] Support additional date/time types and metadata, conversion to/from NumPy and pandas.DataFrame
Would appreciate a close look from @xhochy, @cpcloud. Also did some inline visitor cleaning for nicer code reuse
Author: Wes McKinney <we...@twosigma.com>
Closes #510 from wesm/ARROW-722 and squashes the following commits:
3e1fda3 [Wes McKinney] cpplint
cb32a6b [Wes McKinney] Nicer error message. Run clang-format
854f470 [Wes McKinney] First cut refactor
06dce15 [Wes McKinney] Rebase conflicts
d1dc342 [Wes McKinney] Bring Python bindings to date/time types up to spec. Handle zero-copy creation from same-size int32/64. Use inline visitor in PandasConverter
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/137aade4
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/137aade4
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/137aade4
Branch: refs/heads/master
Commit: 137aade404cf53a7dbe0eaa31a868d1c376654b3
Parents: 754bcce
Author: Wes McKinney <we...@twosigma.com>
Authored: Sun Apr 9 20:00:30 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sun Apr 9 20:00:30 2017 -0400
----------------------------------------------------------------------
cpp/CMakeLists.txt | 6 +
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/python/builtin_convert.cc | 29 +-
cpp/src/arrow/python/builtin_convert.h | 4 +
cpp/src/arrow/python/pandas_convert.cc | 345 +++++++++++------------
cpp/src/arrow/python/pandas_convert.h | 3 -
cpp/src/arrow/python/type_traits.h | 63 +++--
cpp/src/arrow/python/util/datetime.h | 6 +-
cpp/src/arrow/table-test.cc | 55 +---
cpp/src/arrow/table.cc | 10 +-
cpp/src/arrow/table.h | 4 +-
cpp/src/arrow/type.cc | 4 +-
cpp/src/arrow/type.h | 4 +-
cpp/src/arrow/util/stl.h | 4 +-
python/pyarrow/scalar.pyx | 6 +-
python/pyarrow/tests/test_convert_pandas.py | 182 +++++++-----
16 files changed, 399 insertions(+), 327 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5852fe5..b29cb7b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -386,6 +386,12 @@ enable_testing()
set(Boost_DEBUG TRUE)
set(Boost_USE_MULTITHREADED ON)
+set(Boost_ADDITIONAL_VERSIONS
+ "1.63.0" "1.63"
+ "1.62.0" "1.61"
+ "1.61.0" "1.62"
+ "1.60.0" "1.60")
+
if (ARROW_BOOST_USE_SHARED)
# Find shared Boost libraries.
set(Boost_USE_STATIC_LIBS OFF)
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 8eaa76a..cb5282c 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -34,6 +34,7 @@ install(FILES
type_traits.h
test-util.h
visitor.h
+ visitor_inline.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow")
# pkg-config support
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 189ecee..a064a3d 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -43,6 +43,31 @@ static inline bool IsPyInteger(PyObject* obj) {
#endif
}
+Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
+ OwnedRef type(PyObject_Type(obj));
+ RETURN_IF_PYERROR();
+ DCHECK_NE(type.obj(), nullptr);
+
+ OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__"));
+ RETURN_IF_PYERROR();
+ DCHECK_NE(type_name.obj(), nullptr);
+
+ PyObjectStringify bytestring(type_name.obj());
+ RETURN_IF_PYERROR();
+
+ const char* bytes = bytestring.bytes;
+ DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null";
+
+ Py_ssize_t size = bytestring.size;
+
+ std::string cpp_type_name(bytes, size);
+
+ std::stringstream ss;
+ ss << "Python object of type " << cpp_type_name << " is not None and is not a "
+ << expected_type_name << " object";
+ return Status::Invalid(ss.str());
+}
+
class ScalarVisitor {
public:
ScalarVisitor()
@@ -397,7 +422,7 @@ class BytesConverter : public TypedConverter<BinaryBuilder> {
} else if (PyBytes_Check(item)) {
bytes_obj = item;
} else {
- return Status::Invalid("Value that cannot be converted to bytes was encountered");
+ return InvalidConversion(item, "bytes");
}
// No error checking
length = PyBytes_GET_SIZE(bytes_obj);
@@ -431,7 +456,7 @@ class FixedWidthBytesConverter : public TypedConverter<FixedSizeBinaryBuilder> {
} else if (PyBytes_Check(item)) {
bytes_obj = item;
} else {
- return Status::Invalid("Value that cannot be converted to bytes was encountered");
+ return InvalidConversion(item, "bytes");
}
// No error checking
RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/builtin_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index 3c2e350..2141c25 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -24,6 +24,7 @@
#include <Python.h>
#include <memory>
+#include <string>
#include "arrow/type.h"
@@ -60,6 +61,9 @@ ARROW_EXPORT
Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out,
const std::shared_ptr<DataType>& type, int64_t size);
+ARROW_EXPORT
+Status InvalidConversion(PyObject* obj, const std::string& expected_type_name);
+
ARROW_EXPORT Status CheckPythonBytesAreFixedLength(
PyObject* obj, Py_ssize_t expected_length);
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index f6e627e..5bb8e45 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -44,6 +44,7 @@
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
+#include "arrow/visitor_inline.h"
#include "arrow/python/builtin_convert.h"
#include "arrow/python/common.h"
@@ -271,7 +272,7 @@ static inline bool ListTypeSupported(const Type::type type_id) {
// ----------------------------------------------------------------------
// Conversion from NumPy-in-Pandas to Arrow
-class PandasConverter : public TypeVisitor {
+class PandasConverter {
public:
PandasConverter(
MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr<DataType>& type)
@@ -332,23 +333,37 @@ class PandasConverter : public TypeVisitor {
return LoadArray(type_, fields, {null_bitmap_, data}, &out_);
}
-#define VISIT_NATIVE(TYPE) \
- Status Visit(const TYPE& type) override { return VisitNative<TYPE>(); }
+ template <typename T>
+ typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
+ std::is_same<BooleanType, T>::value,
+ Status>::type
+ Visit(const T& type) {
+ return VisitNative<T>();
+ }
+
+ Status Visit(const Date32Type& type) { return VisitNative<Int32Type>(); }
+ Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); }
+ Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
+ Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
+ Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
+
+ Status Visit(const NullType& type) { return Status::NotImplemented("null"); }
+
+ Status Visit(const BinaryType& type) { return Status::NotImplemented(type.ToString()); }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ return Status::NotImplemented(type.ToString());
+ }
- VISIT_NATIVE(BooleanType);
- VISIT_NATIVE(Int8Type);
- VISIT_NATIVE(Int16Type);
- VISIT_NATIVE(Int32Type);
- VISIT_NATIVE(Int64Type);
- VISIT_NATIVE(UInt8Type);
- VISIT_NATIVE(UInt16Type);
- VISIT_NATIVE(UInt32Type);
- VISIT_NATIVE(UInt64Type);
- VISIT_NATIVE(FloatType);
- VISIT_NATIVE(DoubleType);
- VISIT_NATIVE(TimestampType);
+ Status Visit(const DecimalType& type) {
+ return Status::NotImplemented(type.ToString());
+ }
-#undef VISIT_NATIVE
+ Status Visit(const DictionaryType& type) {
+ return Status::NotImplemented(type.ToString());
+ }
+
+ Status Visit(const NestedType& type) { return Status::NotImplemented(type.ToString()); }
Status Convert() {
if (PyArray_NDIM(arr_) != 1) {
@@ -358,9 +373,7 @@ class PandasConverter : public TypeVisitor {
if (type_ == nullptr) { return Status::Invalid("Must pass data type"); }
// Visit the type to perform conversion
- RETURN_NOT_OK(type_->Accept(this));
-
- return Status::OK();
+ return VisitTypeInline(*type_, this);
}
std::shared_ptr<Array> result() const { return out_; }
@@ -371,10 +384,12 @@ class PandasConverter : public TypeVisitor {
template <int ITEM_TYPE, typename ArrowType>
Status ConvertTypedLists(const std::shared_ptr<DataType>& type);
+ template <typename ArrowType>
+ Status ConvertDates();
+
Status ConvertObjectStrings();
Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
Status ConvertBooleans();
- Status ConvertDates();
Status ConvertLists(const std::shared_ptr<DataType>& type);
Status ConvertObjects();
Status ConvertDecimals();
@@ -462,41 +477,36 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
return Status::OK();
}
-Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
- OwnedRef type(PyObject_Type(obj));
- RETURN_IF_PYERROR();
- DCHECK_NE(type.obj(), nullptr);
-
- OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__"));
- RETURN_IF_PYERROR();
- DCHECK_NE(type_name.obj(), nullptr);
-
- PyObjectStringify bytestring(type_name.obj());
- RETURN_IF_PYERROR();
-
- const char* bytes = bytestring.bytes;
- DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null";
-
- Py_ssize_t size = bytestring.size;
+template <typename T>
+struct UnboxDate {};
- std::string cpp_type_name(bytes, size);
+template <>
+struct UnboxDate<Date32Type> {
+ static int64_t Unbox(PyObject* obj) {
+ return PyDate_to_days(reinterpret_cast<PyDateTime_Date*>(obj));
+ }
+};
- std::stringstream ss;
- ss << "Python object of type " << cpp_type_name << " is not None and is not a "
- << expected_type_name << " object";
- return Status::Invalid(ss.str());
-}
+template <>
+struct UnboxDate<Date64Type> {
+ static int64_t Unbox(PyObject* obj) {
+ return PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(obj));
+ }
+};
+template <typename ArrowType>
Status PandasConverter::ConvertDates() {
PyAcquireGIL lock;
+ using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+
Ndarray1DIndexer<PyObject*> objects(arr_);
if (mask_ != nullptr) {
return Status::NotImplemented("mask not supported in object conversions yet");
}
- Date64Builder date_builder(pool_);
+ BuilderType date_builder(pool_);
RETURN_NOT_OK(date_builder.Resize(length_));
/// We have to run this in this compilation unit, since we cannot use the
@@ -508,8 +518,7 @@ Status PandasConverter::ConvertDates() {
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
if (PyDate_CheckExact(obj)) {
- PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj);
- date_builder.Append(PyDate_to_ms(pydate));
+ date_builder.Append(UnboxDate<ArrowType>::Unbox(obj));
} else if (PyObject_is_null(obj)) {
date_builder.AppendNull();
} else {
@@ -762,8 +771,10 @@ Status PandasConverter::ConvertObjects() {
return ConvertObjectFixedWidthBytes(type_);
case Type::BOOL:
return ConvertBooleans();
+ case Type::DATE32:
+ return ConvertDates<Date32Type>();
case Type::DATE64:
- return ConvertDates();
+ return ConvertDates<Date64Type>();
case Type::LIST: {
const auto& list_field = static_cast<const ListType&>(*type_);
return ConvertLists(list_field.value_field()->type);
@@ -787,7 +798,8 @@ Status PandasConverter::ConvertObjects() {
} else if (PyBool_Check(objects[i])) {
return ConvertBooleans();
} else if (PyDate_CheckExact(objects[i])) {
- return ConvertDates();
+ // We could choose Date32 or Date64
+ return ConvertDates<Date32Type>();
} else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) {
return ConvertDecimals();
} else {
@@ -955,34 +967,6 @@ Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
// ----------------------------------------------------------------------
// pandas 0.x DataFrame conversion internals
-inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
- if (type == NPY_DATETIME) {
- PyArray_Descr* descr = PyArray_DESCR(out);
- auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
- if (datatype->type == Type::TIMESTAMP) {
- auto timestamp_type = static_cast<TimestampType*>(datatype);
-
- switch (timestamp_type->unit) {
- case TimestampType::Unit::SECOND:
- date_dtype->meta.base = NPY_FR_s;
- break;
- case TimestampType::Unit::MILLI:
- date_dtype->meta.base = NPY_FR_ms;
- break;
- case TimestampType::Unit::MICRO:
- date_dtype->meta.base = NPY_FR_us;
- break;
- case TimestampType::Unit::NANO:
- date_dtype->meta.base = NPY_FR_ns;
- break;
- }
- } else {
- // datatype->type == Type::DATE64
- date_dtype->meta.base = NPY_FR_D;
- }
- }
-}
-
class PandasBlock {
public:
enum type {
@@ -1148,8 +1132,9 @@ static void ConvertBooleanNoNulls(const ChunkedArray& data, uint8_t* out_values)
}
}
-template <typename ArrayType>
+template <typename Type>
inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
PyAcquireGIL lock;
for (int c = 0; c < data.num_chunks(); c++) {
auto arr = static_cast<ArrayType*>(data.chunk(c).get());
@@ -1287,21 +1272,7 @@ inline void ConvertNumericNullableCast(
}
}
-template <typename T>
-inline void ConvertDates(const ChunkedArray& data, T na_value, T* out_values) {
- for (int c = 0; c < data.num_chunks(); c++) {
- const std::shared_ptr<Array> arr = data.chunk(c);
- auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
- auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
-
- for (int64_t i = 0; i < arr->length(); ++i) {
- // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
- *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / 86400000;
- }
- }
-}
-
-template <typename InType, int SHIFT>
+template <typename InType, int64_t SHIFT>
inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) {
for (int c = 0; c < data.num_chunks(); c++) {
const std::shared_ptr<Array> arr = data.chunk(c);
@@ -1339,9 +1310,9 @@ class ObjectBlock : public PandasBlock {
if (type == Type::BOOL) {
RETURN_NOT_OK(ConvertBooleanWithNulls(data, out_buffer));
} else if (type == Type::BINARY) {
- RETURN_NOT_OK(ConvertBinaryLike<BinaryArray>(data, out_buffer));
+ RETURN_NOT_OK(ConvertBinaryLike<BinaryType>(data, out_buffer));
} else if (type == Type::STRING) {
- RETURN_NOT_OK(ConvertBinaryLike<StringArray>(data, out_buffer));
+ RETURN_NOT_OK(ConvertBinaryLike<StringType>(data, out_buffer));
} else if (type == Type::FIXED_SIZE_BINARY) {
RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer));
} else if (type == Type::DECIMAL) {
@@ -1532,7 +1503,11 @@ class DatetimeBlock : public PandasBlock {
const ChunkedArray& data = *col.get()->data();
- if (type == Type::DATE64) {
+ if (type == Type::DATE32) {
+ // Date32Type is a day count stored as int32_t; scale days to nanoseconds
+ // TODO(wesm): Do we want to make sure to zero out the milliseconds?
+ ConvertDatetimeNanos<int32_t, kNanosecondsInDay>(data, out_buffer);
+ } else if (type == Type::DATE64) {
// Date64Type is millisecond timestamp stored as int64_t
// TODO(wesm): Do we want to make sure to zero out the milliseconds?
ConvertDatetimeNanos<int64_t, 1000000L>(data, out_buffer);
@@ -1779,6 +1754,9 @@ class DataFrameBlockCreator {
case Type::FIXED_SIZE_BINARY:
output_type = PandasBlock::OBJECT;
break;
+ case Type::DATE32:
+ output_type = PandasBlock::DATETIME;
+ break;
case Type::DATE64:
output_type = PandasBlock::DATETIME;
break;
@@ -1960,6 +1938,34 @@ class DataFrameBlockCreator {
BlockMap datetimetz_blocks_;
};
+inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
+ if (type == NPY_DATETIME) {
+ PyArray_Descr* descr = PyArray_DESCR(out);
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+ if (datatype->type == Type::TIMESTAMP) {
+ auto timestamp_type = static_cast<TimestampType*>(datatype);
+
+ switch (timestamp_type->unit) {
+ case TimestampType::Unit::SECOND:
+ date_dtype->meta.base = NPY_FR_s;
+ break;
+ case TimestampType::Unit::MILLI:
+ date_dtype->meta.base = NPY_FR_ms;
+ break;
+ case TimestampType::Unit::MICRO:
+ date_dtype->meta.base = NPY_FR_us;
+ break;
+ case TimestampType::Unit::NANO:
+ date_dtype->meta.base = NPY_FR_ns;
+ break;
+ }
+ } else {
+ // datatype->type == Type::DATE64
+ date_dtype->meta.base = NPY_FR_D;
+ }
+ }
+}
+
class ArrowDeserializer {
public:
ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref)
@@ -2024,51 +2030,14 @@ class ArrowDeserializer {
// Allocate new array and deserialize. Can do a zero copy conversion for some
// types
- Status Convert(PyObject** out) {
-#define CONVERT_CASE(TYPE) \
- case Type::TYPE: { \
- RETURN_NOT_OK(ConvertValues<Type::TYPE>()); \
- } break;
-
- switch (col_->type()->type) {
- CONVERT_CASE(BOOL);
- CONVERT_CASE(INT8);
- CONVERT_CASE(INT16);
- CONVERT_CASE(INT32);
- CONVERT_CASE(INT64);
- CONVERT_CASE(UINT8);
- CONVERT_CASE(UINT16);
- CONVERT_CASE(UINT32);
- CONVERT_CASE(UINT64);
- CONVERT_CASE(FLOAT);
- CONVERT_CASE(DOUBLE);
- CONVERT_CASE(BINARY);
- CONVERT_CASE(STRING);
- CONVERT_CASE(FIXED_SIZE_BINARY);
- CONVERT_CASE(DATE64);
- CONVERT_CASE(TIMESTAMP);
- CONVERT_CASE(DICTIONARY);
- CONVERT_CASE(LIST);
- CONVERT_CASE(DECIMAL);
- default: {
- std::stringstream ss;
- ss << "Arrow type reading not implemented for " << col_->type()->ToString();
- return Status::NotImplemented(ss.str());
- }
- }
-
-#undef CONVERT_CASE
-
- *out = result_;
- return Status::OK();
- }
+ template <typename Type>
+ typename std::enable_if<std::is_base_of<FloatingPoint, Type>::value, Status>::type
+ Visit(const Type& type) {
+ constexpr int TYPE = Type::type_id;
+ using traits = arrow_traits<TYPE>;
- template <int TYPE>
- inline typename std::enable_if<
- (TYPE != Type::DATE64) & arrow_traits<TYPE>::is_numeric_nullable, Status>::type
- ConvertValues() {
- typedef typename arrow_traits<TYPE>::T T;
- int npy_type = arrow_traits<TYPE>::npy_type;
+ typedef typename traits::T T;
+ int npy_type = traits::npy_type;
if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
@@ -2076,31 +2045,56 @@ class ArrowDeserializer {
RETURN_NOT_OK(AllocateOutput(npy_type));
auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
- ConvertNumericNullable<T>(data_, arrow_traits<TYPE>::na_value, out_values);
+ ConvertNumericNullable<T>(data_, traits::na_value, out_values);
return Status::OK();
}
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::DATE64, Status>::type ConvertValues() {
- typedef typename arrow_traits<TYPE>::T T;
+ template <typename Type>
+ typename std::enable_if<std::is_base_of<DateType, Type>::value ||
+ std::is_base_of<TimestampType, Type>::value,
+ Status>::type
+ Visit(const Type& type) {
+ constexpr int TYPE = Type::type_id;
+ using traits = arrow_traits<TYPE>;
+
+ typedef typename traits::T T;
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+ RETURN_NOT_OK(AllocateOutput(traits::npy_type));
auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
- ConvertDates<T>(data_, arrow_traits<TYPE>::na_value, out_values);
+
+ constexpr T na_value = traits::na_value;
+ constexpr int64_t kShift = traits::npy_shift;
+
+ for (int c = 0; c < data_.num_chunks(); c++) {
+ const std::shared_ptr<Array> arr = data_.chunk(c);
+ auto prim_arr = static_cast<PrimitiveArray*>(arr.get());
+ auto in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+
+ for (int64_t i = 0; i < arr->length(); ++i) {
+ *out_values++ = arr->IsNull(i) ? na_value : in_values[i] / kShift;
+ }
+ }
return Status::OK();
}
+ template <typename Type>
+ typename std::enable_if<std::is_base_of<TimeType, Type>::value, Status>::type Visit(
+ const Type& type) {
+ return Status::NotImplemented("Don't know how to serialize Arrow time type to NumPy");
+ }
+
// Integer specialization
- template <int TYPE>
- inline
- typename std::enable_if<arrow_traits<TYPE>::is_numeric_not_nullable, Status>::type
- ConvertValues() {
- typedef typename arrow_traits<TYPE>::T T;
- int npy_type = arrow_traits<TYPE>::npy_type;
+ template <typename Type>
+ typename std::enable_if<std::is_base_of<Integer, Type>::value, Status>::type Visit(
+ const Type& type) {
+ constexpr int TYPE = Type::type_id;
+ using traits = arrow_traits<TYPE>;
+
+ typedef typename traits::T T;
if (data_.num_chunks() == 1 && data_.null_count() == 0 && py_ref_ != nullptr) {
- return ConvertValuesZeroCopy<TYPE>(npy_type, data_.chunk(0));
+ return ConvertValuesZeroCopy<TYPE>(traits::npy_type, data_.chunk(0));
}
if (data_.null_count() > 0) {
@@ -2108,7 +2102,7 @@ class ArrowDeserializer {
auto out_values = reinterpret_cast<double*>(PyArray_DATA(arr_));
ConvertIntegerWithNulls<T>(data_, out_values);
} else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+ RETURN_NOT_OK(AllocateOutput(traits::npy_type));
auto out_values = reinterpret_cast<T*>(PyArray_DATA(arr_));
ConvertIntegerNoNullsSameType<T>(data_, out_values);
}
@@ -2117,15 +2111,13 @@ class ArrowDeserializer {
}
// Boolean specialization
- template <int TYPE>
- inline typename std::enable_if<arrow_traits<TYPE>::is_boolean, Status>::type
- ConvertValues() {
+ Status Visit(const BooleanType& type) {
if (data_.null_count() > 0) {
RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
RETURN_NOT_OK(ConvertBooleanWithNulls(data_, out_values));
} else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+ RETURN_NOT_OK(AllocateOutput(arrow_traits<Type::BOOL>::npy_type));
auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(arr_));
ConvertBooleanNoNulls(data_, out_values);
}
@@ -2133,43 +2125,32 @@ class ArrowDeserializer {
}
// UTF8 strings
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::STRING, Status>::type ConvertValues() {
+ template <typename Type>
+ typename std::enable_if<std::is_base_of<BinaryType, Type>::value, Status>::type Visit(
+ const Type& type) {
RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertBinaryLike<StringArray>(data_, out_values);
- }
-
- // Binary strings
- template <int T2>
- inline typename std::enable_if<T2 == Type::BINARY, Status>::type ConvertValues() {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertBinaryLike<BinaryArray>(data_, out_values);
+ return ConvertBinaryLike<Type>(data_, out_values);
}
// Fixed length binary strings
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::FIXED_SIZE_BINARY, Status>::type
- ConvertValues() {
+ Status Visit(const FixedSizeBinaryType& type) {
RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
return ConvertFixedSizeBinary(data_, out_values);
}
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::DECIMAL, Status>::type ConvertValues() {
+ Status Visit(const DecimalType& type) {
RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
return ConvertDecimals(data_, out_values);
}
+ Status Visit(const ListType& type) {
#define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \
case Type::ArrowEnum: \
return ConvertListsLike<ArrowType>(col_, out_values);
- template <int T2>
- inline typename std::enable_if<T2 == Type::LIST, Status>::type ConvertValues() {
RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
auto list_type = std::static_pointer_cast<ListType>(col_->type());
@@ -2193,10 +2174,10 @@ class ArrowDeserializer {
return Status::NotImplemented(ss.str());
}
}
+#undef CONVERTVALUES_LISTSLIKE_CASE
}
- template <int TYPE>
- inline typename std::enable_if<TYPE == Type::DICTIONARY, Status>::type ConvertValues() {
+ Status Visit(const DictionaryType& type) {
std::shared_ptr<PandasBlock> block;
RETURN_NOT_OK(MakeCategoricalBlock(col_->type(), col_->length(), &block));
RETURN_NOT_OK(block->Write(col_, 0, 0));
@@ -2216,6 +2197,18 @@ class ArrowDeserializer {
return Status::OK();
}
+ Status Visit(const NullType& type) { return Status::NotImplemented("null type"); }
+
+ Status Visit(const StructType& type) { return Status::NotImplemented("struct type"); }
+
+ Status Visit(const UnionType& type) { return Status::NotImplemented("union type"); }
+
+ Status Convert(PyObject** out) {
+ RETURN_NOT_OK(VisitTypeInline(*col_->type(), this));
+ *out = result_;
+ return Status::OK();
+ }
+
private:
std::shared_ptr<Column> col_;
const ChunkedArray& data_;
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/pandas_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h
index 8fd3107..4d32c8b 100644
--- a/cpp/src/arrow/python/pandas_convert.h
+++ b/cpp/src/arrow/python/pandas_convert.h
@@ -71,9 +71,6 @@ ARROW_EXPORT
Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-ARROW_EXPORT
-Status InvalidConversion(PyObject* obj, const std::string& expected_type_name);
-
} // namespace py
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h
index f78dc36..c464d65 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -119,9 +119,6 @@ template <>
struct arrow_traits<Type::BOOL> {
static constexpr int npy_type = NPY_BOOL;
static constexpr bool supports_nulls = false;
- static constexpr bool is_boolean = true;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = false;
};
#define INT_DECL(TYPE) \
@@ -130,9 +127,6 @@ struct arrow_traits<Type::BOOL> {
static constexpr int npy_type = NPY_##TYPE; \
static constexpr bool supports_nulls = false; \
static constexpr double na_value = NAN; \
- static constexpr bool is_boolean = false; \
- static constexpr bool is_numeric_not_nullable = true; \
- static constexpr bool is_numeric_nullable = false; \
typedef typename npy_traits<NPY_##TYPE>::value_type T; \
};
@@ -150,9 +144,6 @@ struct arrow_traits<Type::FLOAT> {
static constexpr int npy_type = NPY_FLOAT32;
static constexpr bool supports_nulls = true;
static constexpr float na_value = NAN;
- static constexpr bool is_boolean = false;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_FLOAT32>::value_type T;
};
@@ -161,33 +152,63 @@ struct arrow_traits<Type::DOUBLE> {
static constexpr int npy_type = NPY_FLOAT64;
static constexpr bool supports_nulls = true;
static constexpr double na_value = NAN;
- static constexpr bool is_boolean = false;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_FLOAT64>::value_type T;
};
static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
+constexpr int64_t kNanosecondsInDay = 86400000000000LL;
+
template <>
struct arrow_traits<Type::TIMESTAMP> {
static constexpr int npy_type = NPY_DATETIME;
+ static constexpr int64_t npy_shift = 1;
+
static constexpr bool supports_nulls = true;
static constexpr int64_t na_value = kPandasTimestampNull;
- static constexpr bool is_boolean = false;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = true;
typedef typename npy_traits<NPY_DATETIME>::value_type T;
};
template <>
+struct arrow_traits<Type::DATE32> {
+ // Data is stored as FR_D day unit
+ static constexpr int npy_type = NPY_DATETIME;
+ static constexpr int64_t npy_shift = 1;
+
+ static constexpr bool supports_nulls = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+};
+
+template <>
struct arrow_traits<Type::DATE64> {
+ // Data is stored as FR_D day unit
static constexpr int npy_type = NPY_DATETIME;
+
+ // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
+ static constexpr int64_t npy_shift = 86400000;
+
+ static constexpr bool supports_nulls = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+
+ static constexpr int64_t na_value = kPandasTimestampNull;
+ static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+};
+
+template <>
+struct arrow_traits<Type::TIME32> {
+ static constexpr int npy_type = NPY_OBJECT;
static constexpr bool supports_nulls = true;
static constexpr int64_t na_value = kPandasTimestampNull;
- static constexpr bool is_boolean = false;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = true;
+ typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::TIME64> {
+ static constexpr int npy_type = NPY_OBJECT;
+ static constexpr bool supports_nulls = true;
typedef typename npy_traits<NPY_DATETIME>::value_type T;
};
@@ -195,18 +216,12 @@ template <>
struct arrow_traits<Type::STRING> {
static constexpr int npy_type = NPY_OBJECT;
static constexpr bool supports_nulls = true;
- static constexpr bool is_boolean = false;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = false;
};
template <>
struct arrow_traits<Type::BINARY> {
static constexpr int npy_type = NPY_OBJECT;
static constexpr bool supports_nulls = true;
- static constexpr bool is_boolean = false;
- static constexpr bool is_numeric_not_nullable = false;
- static constexpr bool is_numeric_nullable = false;
};
} // namespace py
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/python/util/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h
index f704a96..82cf6fc 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -24,7 +24,7 @@
namespace arrow {
namespace py {
-inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
+static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
struct tm date = {0};
date.tm_year = PyDateTime_GET_YEAR(pydate) - 1900;
date.tm_mon = PyDateTime_GET_MONTH(pydate) - 1;
@@ -36,6 +36,10 @@ inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
return lrint(difftime(mktime(&date), mktime(&epoch)) * 1000);
}
+static inline int32_t PyDate_to_days(PyDateTime_Date* pydate) {
+ return static_cast<int32_t>(PyDate_to_ms(pydate) / 86400000LL);
+}
+
} // namespace py
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index 156c3d1..cdc0238 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -398,62 +398,35 @@ TEST_F(TestTable, AddColumn) {
ASSERT_TRUE(status.IsInvalid());
// Add column with wrong length
- auto longer_col = std::make_shared<Column>(
- schema_->field(0), MakePrimitive<Int32Array>(length + 1));
+ auto longer_col =
+ std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length + 1));
status = table.AddColumn(0, longer_col, &result);
ASSERT_TRUE(status.IsInvalid());
// Add column 0 in different places
ASSERT_OK(table.AddColumn(0, columns_[0], &result));
- auto ex_schema = std::shared_ptr<Schema>(new Schema({
- schema_->field(0),
- schema_->field(0),
- schema_->field(1),
- schema_->field(2)}));
+ auto ex_schema = std::shared_ptr<Schema>(new Schema(
+ {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}));
std::vector<std::shared_ptr<Column>> ex_columns = {
- table.column(0),
- table.column(0),
- table.column(1),
- table.column(2)};
+ table.column(0), table.column(0), table.column(1), table.column(2)};
ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
ASSERT_OK(table.AddColumn(1, columns_[0], &result));
- ex_schema = std::shared_ptr<Schema>(new Schema({
- schema_->field(0),
- schema_->field(0),
- schema_->field(1),
- schema_->field(2)}));
- ex_columns = {
- table.column(0),
- table.column(0),
- table.column(1),
- table.column(2)};
+ ex_schema = std::shared_ptr<Schema>(new Schema(
+ {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}));
+ ex_columns = {table.column(0), table.column(0), table.column(1), table.column(2)};
ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
ASSERT_OK(table.AddColumn(2, columns_[0], &result));
- ex_schema = std::shared_ptr<Schema>(new Schema({
- schema_->field(0),
- schema_->field(1),
- schema_->field(0),
- schema_->field(2)}));
- ex_columns = {
- table.column(0),
- table.column(1),
- table.column(0),
- table.column(2)};
+ ex_schema = std::shared_ptr<Schema>(new Schema(
+ {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)}));
+ ex_columns = {table.column(0), table.column(1), table.column(0), table.column(2)};
ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
ASSERT_OK(table.AddColumn(3, columns_[0], &result));
- ex_schema = std::shared_ptr<Schema>(new Schema({
- schema_->field(0),
- schema_->field(1),
- schema_->field(2),
- schema_->field(0)}));
- ex_columns = {
- table.column(0),
- table.column(1),
- table.column(2),
- table.column(0)};
+ ex_schema = std::shared_ptr<Schema>(new Schema(
+ {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)}));
+ ex_columns = {table.column(0), table.column(1), table.column(2), table.column(0)};
ASSERT_TRUE(result->Equals(Table(ex_schema, ex_columns)));
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 9b39f77..4c5257b 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -321,11 +321,9 @@ Status Table::RemoveColumn(int i, std::shared_ptr<Table>* out) const {
return Status::OK();
}
-Status Table::AddColumn(int i, const std::shared_ptr<Column>& col,
- std::shared_ptr<Table>* out) const {
- if (i < 0 || i > num_columns() + 1) {
- return Status::Invalid("Invalid column index.");
- }
+Status Table::AddColumn(
+ int i, const std::shared_ptr<Column>& col, std::shared_ptr<Table>* out) const {
+ if (i < 0 || i > num_columns() + 1) { return Status::Invalid("Invalid column index."); }
if (col == nullptr) {
std::stringstream ss;
ss << "Column " << i << " was null";
@@ -334,7 +332,7 @@ Status Table::AddColumn(int i, const std::shared_ptr<Column>& col,
if (col->length() != num_rows_) {
std::stringstream ss;
ss << "Added column's length must match table's length. Expected length " << num_rows_
- << " but got length " << col->length();
+ << " but got length " << col->length();
return Status::Invalid(ss.str());
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/table.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index dcea53d..b15d31b 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -182,8 +182,8 @@ class ARROW_EXPORT Table {
Status RemoveColumn(int i, std::shared_ptr<Table>* out) const;
/// Add column to the table, producing a new Table
- Status AddColumn(int i, const std::shared_ptr<Column>& column,
- std::shared_ptr<Table>* out) const;
+ Status AddColumn(
+ int i, const std::shared_ptr<Column>& column, std::shared_ptr<Table>* out) const;
// @returns: the number of columns in the table
int num_columns() const { return static_cast<int>(columns_.size()); }
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index df4590f..93cab14 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -258,8 +258,8 @@ std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) {
}
}
-Status Schema::AddField(int i, const std::shared_ptr<Field>& field,
- std::shared_ptr<Schema>* out) const {
+Status Schema::AddField(
+ int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const {
DCHECK_GE(i, 0);
DCHECK_LE(i, this->num_fields());
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 3a35f56..730cbed 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -633,8 +633,8 @@ class ARROW_EXPORT Schema {
// Render a string representation of the schema suitable for debugging
std::string ToString() const;
- Status AddField(int i, const std::shared_ptr<Field>& field,
- std::shared_ptr<Schema>* out) const;
+ Status AddField(
+ int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const;
Status RemoveField(int i, std::shared_ptr<Schema>* out) const;
int num_fields() const { return static_cast<int>(fields_.size()); }
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/cpp/src/arrow/util/stl.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h
index bd25053..bfce111 100644
--- a/cpp/src/arrow/util/stl.h
+++ b/cpp/src/arrow/util/stl.h
@@ -40,8 +40,8 @@ inline std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t i
}
template <typename T>
-inline std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
- const T& new_element) {
+inline std::vector<T> AddVectorElement(
+ const std::vector<T>& values, size_t index, const T& new_element) {
DCHECK_LE(index, values.size());
std::vector<T> out;
out.reserve(values.size() + 1);
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index f3d9321..196deed 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -134,7 +134,11 @@ cdef class UInt64Value(ArrayValue):
cdef class Date32Value(ArrayValue):
def as_py(self):
- raise NotImplementedError
+ cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get()
+
+ # Shift to seconds since epoch
+ return datetime.datetime.utcfromtimestamp(
+ int(ap.Value(self.index)) * 86400).date()
cdef class Date64Value(ArrayValue):
http://git-wip-us.apache.org/repos/asf/arrow/blob/137aade4/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 0504e1d..d1bea0b 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -28,7 +28,7 @@ import pandas as pd
import pandas.util.testing as tm
from pyarrow.compat import u
-import pyarrow as A
+import pyarrow as pa
from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
@@ -67,7 +67,7 @@ class TestPandasConversion(unittest.TestCase):
def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
timestamps_to_ms=False, expected_schema=None,
check_dtype=True, schema=None):
- table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms,
+ table = pa.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms,
schema=schema)
result = table.to_pandas(nthreads=nthreads)
if expected_schema:
@@ -78,7 +78,7 @@ class TestPandasConversion(unittest.TestCase):
def _check_array_roundtrip(self, values, expected=None, mask=None,
timestamps_to_ms=False, type=None):
- arr = A.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
+ arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
mask=mask, type=type)
result = arr.to_pandas()
@@ -99,23 +99,23 @@ class TestPandasConversion(unittest.TestCase):
def test_float_no_nulls(self):
data = {}
fields = []
- dtypes = [('f4', A.float32()), ('f8', A.float64())]
+ dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
num_values = 100
for numpy_dtype, arrow_dtype in dtypes:
values = np.random.randn(num_values)
data[numpy_dtype] = values.astype(numpy_dtype)
- fields.append(A.Field.from_py(numpy_dtype, arrow_dtype))
+ fields.append(pa.Field.from_py(numpy_dtype, arrow_dtype))
df = pd.DataFrame(data)
- schema = A.Schema.from_fields(fields)
+ schema = pa.Schema.from_fields(fields)
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_float_nulls(self):
num_values = 100
null_mask = np.random.randint(0, 10, size=num_values) < 3
- dtypes = [('f4', A.float32()), ('f8', A.float64())]
+ dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
names = ['f4', 'f8']
expected_cols = []
@@ -124,9 +124,9 @@ class TestPandasConversion(unittest.TestCase):
for name, arrow_dtype in dtypes:
values = np.random.randn(num_values).astype(name)
- arr = A.Array.from_numpy(values, null_mask)
+ arr = pa.Array.from_numpy(values, null_mask)
arrays.append(arr)
- fields.append(A.Field.from_py(name, arrow_dtype))
+ fields.append(pa.Field.from_py(name, arrow_dtype))
values[null_mask] = np.nan
expected_cols.append(values)
@@ -134,8 +134,8 @@ class TestPandasConversion(unittest.TestCase):
ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
columns=names)
- table = A.Table.from_arrays(arrays, names)
- assert table.schema.equals(A.Schema.from_fields(fields))
+ table = pa.Table.from_arrays(arrays, names)
+ assert table.schema.equals(pa.Schema.from_fields(fields))
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
@@ -144,11 +144,11 @@ class TestPandasConversion(unittest.TestCase):
fields = []
numpy_dtypes = [
- ('i1', A.int8()), ('i2', A.int16()),
- ('i4', A.int32()), ('i8', A.int64()),
- ('u1', A.uint8()), ('u2', A.uint16()),
- ('u4', A.uint32()), ('u8', A.uint64()),
- ('longlong', A.int64()), ('ulonglong', A.uint64())
+ ('i1', pa.int8()), ('i2', pa.int16()),
+ ('i4', pa.int32()), ('i8', pa.int64()),
+ ('u1', pa.uint8()), ('u2', pa.uint16()),
+ ('u4', pa.uint32()), ('u8', pa.uint64()),
+ ('longlong', pa.int64()), ('ulonglong', pa.uint64())
]
num_values = 100
@@ -158,10 +158,10 @@ class TestPandasConversion(unittest.TestCase):
min(info.max, np.iinfo('i8').max),
size=num_values)
data[dtype] = values.astype(dtype)
- fields.append(A.Field.from_py(dtype, arrow_dtype))
+ fields.append(pa.Field.from_py(dtype, arrow_dtype))
df = pd.DataFrame(data)
- schema = A.Schema.from_fields(fields)
+ schema = pa.Schema.from_fields(fields)
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_integer_with_nulls(self):
@@ -177,7 +177,7 @@ class TestPandasConversion(unittest.TestCase):
for name in int_dtypes:
values = np.random.randint(0, 100, size=num_values)
- arr = A.Array.from_numpy(values, null_mask)
+ arr = pa.Array.from_numpy(values, null_mask)
arrays.append(arr)
expected = values.astype('f8')
@@ -188,7 +188,7 @@ class TestPandasConversion(unittest.TestCase):
ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
columns=int_dtypes)
- table = A.Table.from_arrays(arrays, int_dtypes)
+ table = pa.Table.from_arrays(arrays, int_dtypes)
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
@@ -199,8 +199,8 @@ class TestPandasConversion(unittest.TestCase):
np.random.seed(0)
df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
- field = A.Field.from_py('bools', A.bool_())
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('bools', pa.bool_())
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_boolean_nulls(self):
@@ -211,16 +211,16 @@ class TestPandasConversion(unittest.TestCase):
mask = np.random.randint(0, 10, size=num_values) < 3
values = np.random.randint(0, 10, size=num_values) < 5
- arr = A.Array.from_numpy(values, mask)
+ arr = pa.Array.from_numpy(values, mask)
expected = values.astype(object)
expected[mask] = None
- field = A.Field.from_py('bools', A.bool_())
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('bools', pa.bool_())
+ schema = pa.Schema.from_fields([field])
ex_frame = pd.DataFrame({'bools': expected})
- table = A.Table.from_arrays([arr], ['bools'])
+ table = pa.Table.from_arrays([arr], ['bools'])
assert table.schema.equals(schema)
result = table.to_pandas()
@@ -229,16 +229,16 @@ class TestPandasConversion(unittest.TestCase):
def test_boolean_object_nulls(self):
arr = np.array([False, None, True] * 100, dtype=object)
df = pd.DataFrame({'bools': arr})
- field = A.Field.from_py('bools', A.bool_())
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('bools', pa.bool_())
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, expected_schema=schema)
def test_unicode(self):
repeats = 1000
values = [u'foo', None, u'bar', u'mañana', np.nan]
df = pd.DataFrame({'strings': values * repeats})
- field = A.Field.from_py('strings', A.string())
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('strings', pa.string())
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, expected_schema=schema)
@@ -246,8 +246,8 @@ class TestPandasConversion(unittest.TestCase):
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
df = pd.DataFrame({'strings': values})
- table = A.Table.from_pandas(df)
- assert table[0].type == A.binary()
+ table = pa.Table.from_pandas(df)
+ assert table[0].type == pa.binary()
values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
expected = pd.DataFrame({'strings': values2})
@@ -256,8 +256,8 @@ class TestPandasConversion(unittest.TestCase):
def test_fixed_size_bytes(self):
values = [b'foo', None, b'bar', None, None, b'hey']
df = pd.DataFrame({'strings': values})
- schema = A.Schema.from_fields([A.field('strings', A.binary(3))])
- table = A.Table.from_pandas(df, schema=schema)
+ schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))])
+ table = pa.Table.from_pandas(df, schema=schema)
assert table.schema[0].type == schema[0].type
assert table.schema[0].name == schema[0].name
result = table.to_pandas()
@@ -266,9 +266,9 @@ class TestPandasConversion(unittest.TestCase):
def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
values = [b'foo', None, b'ba', None, None, b'hey']
df = pd.DataFrame({'strings': values})
- schema = A.Schema.from_fields([A.field('strings', A.binary(3))])
- with self.assertRaises(A.ArrowInvalid):
- A.Table.from_pandas(df, schema=schema)
+ schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))])
+ with self.assertRaises(pa.ArrowInvalid):
+ pa.Table.from_pandas(df, schema=schema)
def test_timestamps_notimezone_no_nulls(self):
df = pd.DataFrame({
@@ -278,8 +278,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- field = A.Field.from_py('datetime64', A.timestamp('ms'))
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('datetime64', pa.timestamp('ms'))
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=True,
expected_schema=schema)
@@ -290,8 +290,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- field = A.Field.from_py('datetime64', A.timestamp('ns'))
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('datetime64', pa.timestamp('ns'))
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=False,
expected_schema=schema)
@@ -303,8 +303,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- field = A.Field.from_py('datetime64', A.timestamp('ms'))
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('datetime64', pa.timestamp('ms'))
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=True,
expected_schema=schema)
@@ -315,8 +315,8 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- field = A.Field.from_py('datetime64', A.timestamp('ns'))
- schema = A.Schema.from_fields([field])
+ field = pa.Field.from_py('datetime64', pa.timestamp('ns'))
+ schema = pa.Schema.from_fields([field])
self._check_pandas_roundtrip(df, timestamps_to_ms=False,
expected_schema=schema)
@@ -345,25 +345,77 @@ class TestPandasConversion(unittest.TestCase):
.to_frame())
self._check_pandas_roundtrip(df, timestamps_to_ms=False)
- def test_date(self):
+ def test_date_infer(self):
df = pd.DataFrame({
'date': [datetime.date(2000, 1, 1),
None,
datetime.date(1970, 1, 1),
datetime.date(2040, 2, 26)]})
- table = A.Table.from_pandas(df)
- field = A.Field.from_py('date', A.date64())
- schema = A.Schema.from_fields([field])
+ table = pa.Table.from_pandas(df)
+ field = pa.Field.from_py('date', pa.date32())
+ schema = pa.Schema.from_fields([field])
assert table.schema.equals(schema)
result = table.to_pandas()
expected = df.copy()
expected['date'] = pd.to_datetime(df['date'])
tm.assert_frame_equal(result, expected)
+ def test_date_objects_typed(self):
+ arr = np.array([
+ datetime.date(2017, 4, 3),
+ None,
+ datetime.date(2017, 4, 4),
+ datetime.date(2017, 4, 5)], dtype=object)
+
+ arr_i4 = np.array([17259, -1, 17260, 17261], dtype='int32')
+ arr_i8 = arr_i4.astype('int64') * 86400000
+ mask = np.array([False, True, False, False])
+
+ t32 = pa.date32()
+ t64 = pa.date64()
+
+ a32 = pa.Array.from_numpy(arr, type=t32)
+ a64 = pa.Array.from_numpy(arr, type=t64)
+
+ a32_expected = pa.Array.from_numpy(arr_i4, mask=mask, type=t32)
+ a64_expected = pa.Array.from_numpy(arr_i8, mask=mask, type=t64)
+
+ assert a32.equals(a32_expected)
+ assert a64.equals(a64_expected)
+
+ # Test converting back to pandas
+ colnames = ['date32', 'date64']
+ table = pa.Table.from_arrays([a32, a64], colnames)
+ table_pandas = table.to_pandas()
+
+ ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
+ '2017-04-05'],
+ dtype='datetime64[D]')
+ .astype('datetime64[ns]'))
+ ex_values[1] = pd.NaT.value
+ expected_pandas = pd.DataFrame({'date32': ex_values,
+ 'date64': ex_values},
+ columns=colnames)
+ tm.assert_frame_equal(table_pandas, expected_pandas)
+
+ def test_dates_from_integers(self):
+ t1 = pa.date32()
+ t2 = pa.date64()
+
+ arr = np.array([17259, 17260, 17261], dtype='int32')
+ arr2 = arr.astype('int64') * 86400000
+
+ a1 = pa.Array.from_numpy(arr, type=t1)
+ a2 = pa.Array.from_numpy(arr2, type=t2)
+
+ expected = datetime.date(2017, 4, 3)
+ assert a1[0].as_py() == expected
+ assert a2[0].as_py() == expected
+
def test_column_of_arrays(self):
df, schema = dataframe_with_arrays()
self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
- table = A.Table.from_pandas(df, schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema)
assert table.schema.equals(schema)
for column in df.columns:
@@ -373,7 +425,7 @@ class TestPandasConversion(unittest.TestCase):
def test_column_of_lists(self):
df, schema = dataframe_with_lists()
self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
- table = A.Table.from_pandas(df, schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema)
assert table.schema.equals(schema)
for column in df.columns:
@@ -410,8 +462,8 @@ class TestPandasConversion(unittest.TestCase):
def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
- with self.assertRaises(A.ArrowException):
- A.Table.from_pandas(data)
+ with self.assertRaises(pa.ArrowException):
+ pa.Table.from_pandas(data)
def test_strided_data_import(self):
cases = []
@@ -460,9 +512,9 @@ class TestPandasConversion(unittest.TestCase):
decimal.Decimal('1234.439'),
]
})
- converted = A.Table.from_pandas(expected)
- field = A.Field.from_py('decimals', A.decimal(7, 3))
- schema = A.Schema.from_fields([field])
+ converted = pa.Table.from_pandas(expected)
+ field = pa.Field.from_py('decimals', pa.decimal(7, 3))
+ schema = pa.Schema.from_fields([field])
assert converted.schema.equals(schema)
def test_decimal_32_to_pandas(self):
@@ -472,7 +524,7 @@ class TestPandasConversion(unittest.TestCase):
decimal.Decimal('1234.439'),
]
})
- converted = A.Table.from_pandas(expected)
+ converted = pa.Table.from_pandas(expected)
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)
@@ -483,9 +535,9 @@ class TestPandasConversion(unittest.TestCase):
decimal.Decimal('129534.123731'),
]
})
- converted = A.Table.from_pandas(expected)
- field = A.Field.from_py('decimals', A.decimal(12, 6))
- schema = A.Schema.from_fields([field])
+ converted = pa.Table.from_pandas(expected)
+ field = pa.Field.from_py('decimals', pa.decimal(12, 6))
+ schema = pa.Schema.from_fields([field])
assert converted.schema.equals(schema)
def test_decimal_64_to_pandas(self):
@@ -495,7 +547,7 @@ class TestPandasConversion(unittest.TestCase):
decimal.Decimal('129534.123731'),
]
})
- converted = A.Table.from_pandas(expected)
+ converted = pa.Table.from_pandas(expected)
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)
@@ -506,9 +558,9 @@ class TestPandasConversion(unittest.TestCase):
-decimal.Decimal('314292388910493.12343437128'),
]
})
- converted = A.Table.from_pandas(expected)
- field = A.Field.from_py('decimals', A.decimal(26, 11))
- schema = A.Schema.from_fields([field])
+ converted = pa.Table.from_pandas(expected)
+ field = pa.Field.from_py('decimals', pa.decimal(26, 11))
+ schema = pa.Schema.from_fields([field])
assert converted.schema.equals(schema)
def test_decimal_128_to_pandas(self):
@@ -518,6 +570,6 @@ class TestPandasConversion(unittest.TestCase):
-decimal.Decimal('314292388910493.12343437128'),
]
})
- converted = A.Table.from_pandas(expected)
+ converted = pa.Table.from_pandas(expected)
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)