You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/16 21:29:50 UTC
[2/2] arrow git commit: ARROW-1183: [Python] Implement pandas conversions between Time32, Time64 types and datetime.time
ARROW-1183: [Python] Implement pandas conversions between Time32, Time64 types and datetime.time
There's also a little bit of code reorganization; sorry about the large diff.
Author: Wes McKinney <we...@twosigma.com>
Closes #849 from wesm/ARROW-1183 and squashes the following commits:
7e94e25c [Wes McKinney] Improve error messages to add extra context when using InvalidConversion
9659b903 [Wes McKinney] Always install thrift-cpp in toolchain Parquet MSVC build
41ce47ea [Wes McKinney] Fix MSVC compiler warning
681459c9 [Wes McKinney] Add missing PyDateTime_IMPORT
7024a366 [Wes McKinney] Finish roundtrip of time32/time64 to array of pytime
58fe4c00 [Wes McKinney] Add time to_pandas test
58f99f60 [Wes McKinney] Test from_pandas conversions from pytime
9228ec46 [Wes McKinney] Start in on time conversions
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/50b518af
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/50b518af
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/50b518af
Branch: refs/heads/master
Commit: 50b518afcb03172e6133643eb5cbecd03c63368c
Parents: bf01966
Author: Wes McKinney <we...@twosigma.com>
Authored: Sun Jul 16 17:29:43 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sun Jul 16 17:29:43 2017 -0400
----------------------------------------------------------------------
ci/msvc-build.bat | 16 +-
cpp/src/arrow/builder.cc | 11 +-
cpp/src/arrow/ipc/metadata.cc | 4 +-
cpp/src/arrow/python/builtin_convert.cc | 21 +-
cpp/src/arrow/python/builtin_convert.h | 4 +-
cpp/src/arrow/python/pandas_convert.cc | 216 +++--
cpp/src/arrow/python/util/datetime.h | 45 +
cpp/src/arrow/util/logging.h | 3 +-
cpp/src/plasma/protocol.cc | 4 +-
python/pyarrow/array.pxi | 1119 +---------------------
python/pyarrow/lib.pyx | 6 +
python/pyarrow/pandas_compat.py | 2 +
python/pyarrow/scalar.pxi | 376 ++++++++
python/pyarrow/tests/test_array.py | 4 +-
python/pyarrow/tests/test_convert_pandas.py | 106 ++
python/pyarrow/types.pxi | 776 +++++++++++++++
16 files changed, 1506 insertions(+), 1207 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/ci/msvc-build.bat
----------------------------------------------------------------------
diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat
index be5fd7b..22108ab 100644
--- a/ci/msvc-build.bat
+++ b/ci/msvc-build.bat
@@ -35,14 +35,26 @@ if "%JOB%" == "Build_Debug" (
)
conda update --yes --quiet conda
+conda config --set auto_update_conda false
+conda info -a
+
+conda config --set show_channel_urls True
+
+# Help with SSL timeouts to S3
+conda config --set remote_connect_timeout_secs 12
+
+conda config --add channels https://repo.continuum.io/pkgs/free
+conda config --add channels conda-forge
+conda info -a
conda create -n arrow -q -y python=%PYTHON% ^
- six pytest setuptools numpy pandas cython
+ six pytest setuptools numpy pandas cython ^
+ thrift-cpp
if "%JOB%" == "Toolchain" (
conda install -n arrow -q -y -c conda-forge ^
flatbuffers rapidjson cmake git boost-cpp ^
- thrift-cpp snappy zlib brotli gflags lz4-c zstd
+ snappy zlib brotli gflags lz4-c zstd
)
call activate arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index e466838..885c650 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -984,16 +984,17 @@ Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) {
ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_builder,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(
- pool, type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(value_builder->type()))),
+ : ArrayBuilder(pool,
+ type ? type : std::static_pointer_cast<DataType>(
+ std::make_shared<ListType>(value_builder->type()))),
offset_builder_(pool),
value_builder_(std::move(value_builder)) {}
ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(pool, type ? type : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(values->type()))),
+ : ArrayBuilder(pool,
+ type ? type : std::static_pointer_cast<DataType>(
+ std::make_shared<ListType>(values->type()))),
offset_builder_(pool),
values_(values) {}
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/ipc/metadata.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc
index 5b2ca3b..49c24c7 100644
--- a/cpp/src/arrow/ipc/metadata.cc
+++ b/cpp/src/arrow/ipc/metadata.cc
@@ -933,9 +933,7 @@ const void* Message::header() const {
bool Message::Equals(const Message& other) const {
int64_t metadata_bytes = std::min(metadata()->size(), other.metadata()->size());
- if (!metadata()->Equals(*other.metadata(), metadata_bytes)) {
- return false;
- }
+ if (!metadata()->Equals(*other.metadata(), metadata_bytes)) { return false; }
// Compare bodies, if they have them
auto this_body = body();
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 816f95a..83154bb 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -44,7 +44,8 @@ static inline bool IsPyInteger(PyObject* obj) {
#endif
}
-Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
+Status InvalidConversion(PyObject* obj, const std::string& expected_types,
+ std::ostream* out) {
OwnedRef type(PyObject_Type(obj));
RETURN_IF_PYERROR();
DCHECK_NE(type.obj(), nullptr);
@@ -63,10 +64,10 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
std::string cpp_type_name(bytes, size);
- std::stringstream ss;
- ss << "Python object of type " << cpp_type_name << " is not None and is not a "
- << expected_type_name << " object";
- return Status::Invalid(ss.str());
+ (*out) << "Got Python object of type " << cpp_type_name
+ << " but can only handle these types: "
+ << expected_types;
+ return Status::OK();
}
class ScalarVisitor {
@@ -462,7 +463,10 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverte
} else if (PyBytes_Check(item.obj())) {
bytes_obj = item.obj();
} else {
- return InvalidConversion(item.obj(), "bytes");
+ std::stringstream ss;
+ ss << "Error converting to Binary type: ";
+ RETURN_NOT_OK(InvalidConversion(item.obj(), "bytes", &ss));
+ return Status::Invalid(ss.str());
}
// No error checking
length = PyBytes_GET_SIZE(bytes_obj);
@@ -489,7 +493,10 @@ class FixedWidthBytesConverter
} else if (PyBytes_Check(item.obj())) {
bytes_obj = item.obj();
} else {
- return InvalidConversion(item.obj(), "bytes");
+ std::stringstream ss;
+ ss << "Error converting to FixedSizeBinary type: ";
+ RETURN_NOT_OK(InvalidConversion(item.obj(), "bytes", &ss));
+ return Status::Invalid(ss.str());
}
// No error checking
RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/python/builtin_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index dd878b2..4f84fbb 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -24,6 +24,7 @@
#include "arrow/python/platform.h"
#include <memory>
+#include <ostream>
#include <string>
#include "arrow/type.h"
@@ -62,7 +63,8 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>
const std::shared_ptr<DataType>& type, int64_t size);
ARROW_EXPORT
-Status InvalidConversion(PyObject* obj, const std::string& expected_type_name);
+Status InvalidConversion(
+ PyObject* obj, const std::string& expected_type_name, std::ostream* out);
ARROW_EXPORT Status CheckPythonBytesAreFixedLength(
PyObject* obj, Py_ssize_t expected_length);
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 83cd35a..c520d8d 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -189,7 +189,10 @@ static Status AppendObjectStrings(
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
} else {
- return InvalidConversion(obj, "string or bytes");
+ std::stringstream ss;
+ ss << "Error converting to Python objects to String/UTF8: ";
+ RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
+ return Status::Invalid(ss.str());
}
}
@@ -230,7 +233,10 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas
RETURN_NOT_OK(
builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
} else {
- return InvalidConversion(obj, "string or bytes");
+ std::stringstream ss;
+ ss << "Error converting to Python objects to FixedSizeBinary: ";
+ RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
+ return Status::Invalid(ss.str());
}
}
@@ -416,6 +422,7 @@ class PandasConverter {
const std::shared_ptr<DataType>& type, ListBuilder& list_builder, PyObject* list);
Status ConvertObjects();
Status ConvertDecimals();
+ Status ConvertTimes();
protected:
MemoryPool* pool_;
@@ -536,7 +543,6 @@ Status PandasConverter::ConvertDates() {
/// datetime API otherwise
PyDateTime_IMPORT;
- Status s;
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
@@ -545,7 +551,11 @@ Status PandasConverter::ConvertDates() {
} else if (PandasObjectIsNull(obj)) {
RETURN_NOT_OK(date_builder.AppendNull());
} else {
- return InvalidConversion(obj, "date");
+ std::stringstream ss;
+ ss << "Error converting from Python objects to "
+ << type_->ToString() << ": ";
+ RETURN_NOT_OK(InvalidConversion(obj, "datetime.date", &ss));
+ return Status::Invalid(ss.str());
}
}
return date_builder.Finish(&out_);
@@ -582,7 +592,6 @@ Status PandasConverter::ConvertDecimals() {
const int bit_width = std::dynamic_pointer_cast<DecimalType>(type_)->bit_width();
DecimalBuilder decimal_builder(pool_, type_);
-
RETURN_NOT_OK(decimal_builder.Resize(length_));
for (int64_t i = 0; i < length_; ++i) {
@@ -598,12 +607,45 @@ Status PandasConverter::ConvertDecimals() {
} else if (PandasObjectIsNull(object)) {
RETURN_NOT_OK(decimal_builder.AppendNull());
} else {
- return InvalidConversion(object, "decimal.Decimal");
+ std::stringstream ss;
+ ss << "Error converting from Python objects to "
+ << type_->ToString() << ": ";
+ RETURN_NOT_OK(InvalidConversion(object, "decimal.Decimal", &ss));
+ return Status::Invalid(ss.str());
}
}
return decimal_builder.Finish(&out_);
}
+Status PandasConverter::ConvertTimes() {
+ // Convert array of datetime.time objects to Arrow
+ PyAcquireGIL lock;
+ PyDateTime_IMPORT;
+
+ PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+
+ // datetime.time stores microsecond resolution
+ Time64Builder builder(pool_, ::arrow::time64(TimeUnit::MICRO));
+ RETURN_NOT_OK(builder.Resize(length_));
+
+ PyObject* obj;
+ for (int64_t i = 0; i < length_; ++i) {
+ obj = objects[i];
+ if (PyTime_Check(obj)) {
+ RETURN_NOT_OK(builder.Append(PyTime_to_us(obj)));
+ } else if (PandasObjectIsNull(obj)) {
+ RETURN_NOT_OK(builder.AppendNull());
+ } else {
+ std::stringstream ss;
+ ss << "Error converting from Python objects to "
+ << type_->ToString() << ": ";
+ RETURN_NOT_OK(InvalidConversion(obj, "datetime.time", &ss));
+ return Status::Invalid(ss.str());
+ }
+ }
+ return builder.Finish(&out_);
+}
+
#undef CONVERT_DECIMAL_CASE
Status PandasConverter::ConvertObjectStrings() {
@@ -631,9 +673,6 @@ Status PandasConverter::ConvertObjectStrings() {
Status PandasConverter::ConvertObjectFloats() {
PyAcquireGIL lock;
- DoubleBuilder builder(pool_);
- RETURN_NOT_OK(builder.Resize(length_));
-
Ndarray1DIndexer<PyObject*> objects(arr_);
Ndarray1DIndexer<uint8_t> mask_values;
@@ -643,6 +682,9 @@ Status PandasConverter::ConvertObjectFloats() {
have_mask = true;
}
+ DoubleBuilder builder(pool_);
+ RETURN_NOT_OK(builder.Resize(length_));
+
PyObject* obj;
for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
@@ -653,7 +695,11 @@ Status PandasConverter::ConvertObjectFloats() {
RETURN_IF_PYERROR();
RETURN_NOT_OK(builder.Append(val));
} else {
- return InvalidConversion(obj, "float");
+ std::stringstream ss;
+ ss << "Error converting from Python objects to "
+ << type_->ToString() << ": ";
+ RETURN_NOT_OK(InvalidConversion(obj, "float", &ss));
+ return Status::Invalid(ss.str());
}
}
@@ -685,7 +731,11 @@ Status PandasConverter::ConvertObjectIntegers() {
RETURN_IF_PYERROR();
RETURN_NOT_OK(builder.Append(val));
} else {
- return InvalidConversion(obj, "integer");
+ std::stringstream ss;
+ ss << "Error converting from Python objects to "
+ << type_->ToString() << ": ";
+ RETURN_NOT_OK(InvalidConversion(obj, "integer", &ss));
+ return Status::Invalid(ss.str());
}
}
@@ -791,6 +841,36 @@ static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) {
return Status::OK();
}
+template <typename TYPE>
+static Status ConvertTimes(const ChunkedArray& data, PyObject** out_values) {
+ using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+
+ PyAcquireGIL lock;
+ OwnedRef time_ref;
+
+ PyDateTime_IMPORT;
+
+ for (int c = 0; c < data.num_chunks(); c++) {
+ const auto& arr = static_cast<const ArrayType&>(*data.chunk(c));
+ auto type = std::dynamic_pointer_cast<TYPE>(arr.type());
+ DCHECK(type);
+
+ const TimeUnit::type unit = type->unit();
+
+ for (int64_t i = 0; i < arr.length(); ++i) {
+ if (arr.IsNull(i)) {
+ Py_INCREF(Py_None);
+ *out_values++ = Py_None;
+ } else {
+ RETURN_NOT_OK(PyTime_from_int(arr.Value(i), unit, out_values++));
+ RETURN_IF_PYERROR();
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
Status PandasConverter::ConvertBooleans() {
PyAcquireGIL lock;
@@ -821,7 +901,11 @@ Status PandasConverter::ConvertBooleans() {
} else if (obj == Py_False) {
BitUtil::SetBit(null_bitmap_data_, i);
} else {
- return InvalidConversion(obj, "bool");
+ std::stringstream ss;
+ ss << "Error converting from Python objects to "
+ << type_->ToString() << ": ";
+ RETURN_NOT_OK(InvalidConversion(obj, "bool", &ss));
+ return Status::Invalid(ss.str());
}
}
@@ -882,28 +966,35 @@ Status PandasConverter::ConvertObjects() {
RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
for (int64_t i = 0; i < length_; ++i) {
- if (PandasObjectIsNull(objects[i])) {
+ PyObject* obj = objects[i];
+ if (PandasObjectIsNull(obj)) {
continue;
- } else if (PyObject_is_string(objects[i])) {
+ } else if (PyObject_is_string(obj)) {
return ConvertObjectStrings();
- } else if (PyObject_is_float(objects[i])) {
+ } else if (PyObject_is_float(obj)) {
return ConvertObjectFloats();
- } else if (PyBool_Check(objects[i])) {
+ } else if (PyBool_Check(obj)) {
return ConvertBooleans();
- } else if (PyObject_is_integer(objects[i])) {
+ } else if (PyObject_is_integer(obj)) {
return ConvertObjectIntegers();
- } else if (PyDate_CheckExact(objects[i])) {
+ } else if (PyDate_CheckExact(obj)) {
// We could choose Date32 or Date64
return ConvertDates<Date32Type>();
- } else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) {
+ } else if (PyTime_Check(obj)) {
+ return ConvertTimes();
+ } else if (PyObject_IsInstance(const_cast<PyObject*>(obj), Decimal.obj())) {
return ConvertDecimals();
- } else if (PyList_Check(objects[i]) || PyArray_Check(objects[i])) {
+ } else if (PyList_Check(obj) || PyArray_Check(obj)) {
std::shared_ptr<DataType> inferred_type;
- RETURN_NOT_OK(InferArrowType(objects[i], &inferred_type));
+ RETURN_NOT_OK(InferArrowType(obj, &inferred_type));
return ConvertLists(inferred_type);
} else {
- return InvalidConversion(const_cast<PyObject*>(objects[i]),
- "string, bool, float, int, date, decimal, list, array");
+ const std::string supported_types =
+ "string, bool, float, int, date, time, decimal, list, array";
+ std::stringstream ss;
+ ss << "Error inferring Arrow type for Python object array. ";
+ RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss));
+ return Status::Invalid(ss.str());
}
}
}
@@ -1187,7 +1278,6 @@ class PandasBlock {
INT64,
FLOAT,
DOUBLE,
- DECIMAL,
BOOL,
DATETIME,
DATETIME_WITH_TZ,
@@ -1598,6 +1688,10 @@ class ObjectBlock : public PandasBlock {
RETURN_NOT_OK(ConvertBinaryLike<StringType>(data, out_buffer));
} else if (type == Type::FIXED_SIZE_BINARY) {
RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer));
+ } else if (type == Type::TIME32) {
+ RETURN_NOT_OK(ConvertTimes<Time32Type>(data, out_buffer));
+ } else if (type == Type::TIME64) {
+ RETURN_NOT_OK(ConvertTimes<Time64Type>(data, out_buffer));
} else if (type == Type::DECIMAL) {
RETURN_NOT_OK(ConvertDecimals(data, out_buffer));
} else if (type == Type::NA) {
@@ -1950,7 +2044,6 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns,
BLOCK_CASE(DOUBLE, Float64Block);
BLOCK_CASE(BOOL, BoolBlock);
BLOCK_CASE(DATETIME, DatetimeBlock);
- BLOCK_CASE(DECIMAL, ObjectBlock);
default:
return Status::NotImplemented("Unsupported block type");
}
@@ -2052,9 +2145,14 @@ class DataFrameBlockCreator {
case Type::DOUBLE:
output_type = PandasBlock::DOUBLE;
break;
+ case Type::NA:
case Type::STRING:
case Type::BINARY:
case Type::FIXED_SIZE_BINARY:
+ case Type::STRUCT:
+ case Type::TIME32:
+ case Type::TIME64:
+ case Type::DECIMAL:
output_type = PandasBlock::OBJECT;
break;
case Type::DATE32:
@@ -2084,13 +2182,6 @@ class DataFrameBlockCreator {
case Type::DICTIONARY:
output_type = PandasBlock::CATEGORICAL;
break;
- case Type::DECIMAL:
- output_type = PandasBlock::DECIMAL;
- break;
- case Type::NA:
- case Type::STRUCT:
- output_type = PandasBlock::OBJECT;
- break;
default:
std::stringstream ss;
ss << "No known equivalent Pandas block for Arrow data of type ";
@@ -2386,40 +2477,45 @@ class ArrowDeserializer {
return Status::OK();
}
- // Boolean specialization
- Status Visit(const BooleanType& type) {
- if (data_.null_count() > 0) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- RETURN_NOT_OK(ConvertBooleanWithNulls(data_, out_values));
- } else {
- RETURN_NOT_OK(AllocateOutput(arrow_traits<Type::BOOL>::npy_type));
- auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(arr_));
- ConvertBooleanNoNulls(data_, out_values);
- }
- return Status::OK();
+ template <typename FUNCTOR>
+ inline Status VisitObjects(FUNCTOR func) {
+ RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+ auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ return func(data_, out_values);
}
// UTF8 strings
template <typename Type>
typename std::enable_if<std::is_base_of<BinaryType, Type>::value, Status>::type Visit(
const Type& type) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertBinaryLike<Type>(data_, out_values);
+ return VisitObjects(ConvertBinaryLike<Type>);
}
+ Status Visit(const NullType& type) { return VisitObjects(ConvertNulls); }
+
// Fixed length binary strings
Status Visit(const FixedSizeBinaryType& type) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertFixedSizeBinary(data_, out_values);
+ return VisitObjects(ConvertFixedSizeBinary);
}
- Status Visit(const DecimalType& type) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertDecimals(data_, out_values);
+ Status Visit(const DecimalType& type) { return VisitObjects(ConvertDecimals); }
+
+ Status Visit(const Time32Type& type) { return VisitObjects(ConvertTimes<Time32Type>); }
+
+ Status Visit(const Time64Type& type) { return VisitObjects(ConvertTimes<Time64Type>); }
+
+ Status Visit(const StructType& type) { return VisitObjects(ConvertStruct); }
+
+ // Boolean specialization
+ Status Visit(const BooleanType& type) {
+ if (data_.null_count() > 0) {
+ return VisitObjects(ConvertBooleanWithNulls);
+ } else {
+ RETURN_NOT_OK(AllocateOutput(arrow_traits<Type::BOOL>::npy_type));
+ auto out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(arr_));
+ ConvertBooleanNoNulls(data_, out_values);
+ }
+ return Status::OK();
}
Status Visit(const ListType& type) {
@@ -2479,18 +2575,6 @@ class ArrowDeserializer {
return Status::OK();
}
- Status Visit(const NullType& type) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertNulls(data_, out_values);
- }
-
- Status Visit(const StructType& type) {
- RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
- auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- return ConvertStruct(data_, out_values);
- }
-
Status Visit(const UnionType& type) { return Status::NotImplemented("union type"); }
Status Convert(PyObject** out) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/python/util/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/util/datetime.h b/cpp/src/arrow/python/util/datetime.h
index 7ebf46a..d32421e 100644
--- a/cpp/src/arrow/python/util/datetime.h
+++ b/cpp/src/arrow/python/util/datetime.h
@@ -24,6 +24,51 @@
namespace arrow {
namespace py {
+static inline int64_t PyTime_to_us(PyObject* pytime) {
+ return (static_cast<int64_t>(PyDateTime_TIME_GET_HOUR(pytime)) * 3600000000LL +
+ static_cast<int64_t>(PyDateTime_TIME_GET_MINUTE(pytime)) * 60000000LL +
+ static_cast<int64_t>(PyDateTime_TIME_GET_SECOND(pytime)) * 1000000LL +
+ PyDateTime_TIME_GET_MICROSECOND(pytime));
+}
+
+static inline Status PyTime_from_int(
+ int64_t val, const TimeUnit::type unit, PyObject** out) {
+ int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
+ switch (unit) {
+ case TimeUnit::NANO:
+ if (val % 1000 != 0) {
+ std::stringstream ss;
+ ss << "Value " << val << " has non-zero nanoseconds";
+ return Status::Invalid(ss.str());
+ }
+ val /= 1000;
+ // fall through
+ case TimeUnit::MICRO:
+ microsecond = val - (val / 1000000LL) * 1000000LL;
+ val /= 1000000LL;
+ second = val - (val / 60) * 60;
+ val /= 60;
+ minute = val - (val / 60) * 60;
+ hour = val / 60;
+ break;
+ case TimeUnit::MILLI:
+ microsecond = (val - (val / 1000) * 1000) * 1000;
+ val /= 1000;
+ // fall through
+ case TimeUnit::SECOND:
+ second = val - (val / 60) * 60;
+ val /= 60;
+ minute = val - (val / 60) * 60;
+ hour = val / 60;
+ break;
+ default:
+ break;
+ }
+ *out = PyTime_FromTime(static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+ static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
+ return Status::OK();
+}
+
static inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
struct tm date = {0};
date.tm_year = PyDateTime_GET_YEAR(pydate) - 1900;
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/arrow/util/logging.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h
index b618121..2fec4fa 100644
--- a/cpp/src/arrow/util/logging.h
+++ b/cpp/src/arrow/util/logging.h
@@ -103,8 +103,7 @@ class NullLog {
class CerrLog {
public:
CerrLog(int severity) // NOLINT(runtime/explicit)
- : severity_(severity),
- has_logged_(false) {}
+ : severity_(severity), has_logged_(false) {}
virtual ~CerrLog() {
if (has_logged_) { std::cerr << std::endl; }
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/cpp/src/plasma/protocol.cc
----------------------------------------------------------------------
diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc
index 246aa29..9739d77 100644
--- a/cpp/src/plasma/protocol.cc
+++ b/cpp/src/plasma/protocol.cc
@@ -38,8 +38,8 @@ to_flatbuffer(flatbuffers::FlatBufferBuilder* fbb, const ObjectID* object_ids,
Status PlasmaReceive(int sock, int64_t message_type, std::vector<uint8_t>* buffer) {
int64_t type;
RETURN_NOT_OK(ReadMessage(sock, &type, buffer));
- ARROW_CHECK(type == message_type) << "type = " << type
- << ", message_type = " << message_type;
+ ARROW_CHECK(type == message_type)
+ << "type = " << type << ", message_type = " << message_type;
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae9ff88..f8bccc7 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -15,1122 +15,6 @@
# specific language governing permissions and limitations
# under the License.
-from pyarrow.includes.libarrow cimport *
-
-# These are imprecise because the type (in pandas 0.x) depends on the presence
-# of nulls
-cdef dict _pandas_type_map = {
- _Type_NA: np.float64, # NaNs
- _Type_BOOL: np.bool_,
- _Type_INT8: np.int8,
- _Type_INT16: np.int16,
- _Type_INT32: np.int32,
- _Type_INT64: np.int64,
- _Type_UINT8: np.uint8,
- _Type_UINT16: np.uint16,
- _Type_UINT32: np.uint32,
- _Type_UINT64: np.uint64,
- _Type_HALF_FLOAT: np.float16,
- _Type_FLOAT: np.float32,
- _Type_DOUBLE: np.float64,
- _Type_DATE32: np.dtype('datetime64[ns]'),
- _Type_DATE64: np.dtype('datetime64[ns]'),
- _Type_TIMESTAMP: np.dtype('datetime64[ns]'),
- _Type_BINARY: np.object_,
- _Type_FIXED_SIZE_BINARY: np.object_,
- _Type_STRING: np.object_,
- _Type_LIST: np.object_,
- _Type_DECIMAL: np.object_,
-}
-
-
-cdef class DataType:
-
- def __cinit__(self):
- pass
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- self.sp_type = type
- self.type = type.get()
-
- property id:
-
- def __get__(self):
- return self.type.id()
-
- def __str__(self):
- if self.type is NULL:
- raise TypeError(
- '{} is incomplete. The correct way to construct types is '
- 'through public API functions named '
- 'pyarrow.int64, pyarrow.list_, etc.'.format(
- type(self).__name__
- )
- )
- return frombytes(self.type.ToString())
-
- def __repr__(self):
- return '{0.__class__.__name__}({0})'.format(self)
-
- def __richcmp__(DataType self, DataType other, int op):
- if op == cp.Py_EQ:
- return self.type.Equals(deref(other.type))
- elif op == cp.Py_NE:
- return not self.type.Equals(deref(other.type))
- else:
- raise TypeError('Invalid comparison')
-
- def to_pandas_dtype(self):
- """
- Return the NumPy dtype that would be used for storing this
- """
- cdef Type type_id = self.type.id()
- if type_id in _pandas_type_map:
- return _pandas_type_map[type_id]
- else:
- raise NotImplementedError(str(self))
-
-
-cdef class DictionaryType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.dict_type = <const CDictionaryType*> type.get()
-
-
-cdef class ListType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.list_type = <const CListType*> type.get()
-
- property value_type:
-
- def __get__(self):
- return pyarrow_wrap_data_type(self.list_type.value_type())
-
-
-cdef class TimestampType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.ts_type = <const CTimestampType*> type.get()
-
- property unit:
-
- def __get__(self):
- return timeunit_to_string(self.ts_type.unit())
-
- property tz:
-
- def __get__(self):
- if self.ts_type.timezone().size() > 0:
- return frombytes(self.ts_type.timezone())
- else:
- return None
-
-
-cdef class Time32Type(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.time_type = <const CTime32Type*> type.get()
-
- property unit:
-
- def __get__(self):
- return timeunit_to_string(self.time_type.unit())
-
-
-cdef class Time64Type(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.time_type = <const CTime64Type*> type.get()
-
- property unit:
-
- def __get__(self):
- return timeunit_to_string(self.time_type.unit())
-
-
-cdef class FixedSizeBinaryType(DataType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.fixed_size_binary_type = (
- <const CFixedSizeBinaryType*> type.get())
-
- property byte_width:
-
- def __get__(self):
- return self.fixed_size_binary_type.byte_width()
-
-
-cdef class DecimalType(FixedSizeBinaryType):
-
- cdef void init(self, const shared_ptr[CDataType]& type):
- DataType.init(self, type)
- self.decimal_type = <const CDecimalType*> type.get()
-
- property precision:
-
- def __get__(self):
- return self.decimal_type.precision()
-
- property scale:
-
- def __get__(self):
- return self.decimal_type.scale()
-
-
-cdef class Field:
- """
- Represents a named field, with a data type, nullability, and optional
- metadata
-
- Notes
- -----
- Do not use this class's constructor directly; use pyarrow.field
- """
- def __cinit__(self):
- pass
-
- cdef void init(self, const shared_ptr[CField]& field):
- self.sp_field = field
- self.field = field.get()
- self.type = pyarrow_wrap_data_type(field.get().type())
-
- def equals(self, Field other):
- """
- Test if this field is equal to the other
- """
- return self.field.Equals(deref(other.field))
-
- def __str__(self):
- self._check_null()
- return 'pyarrow.Field<{0}>'.format(frombytes(self.field.ToString()))
-
- def __repr__(self):
- return self.__str__()
-
- property nullable:
-
- def __get__(self):
- self._check_null()
- return self.field.nullable()
-
- property name:
-
- def __get__(self):
- self._check_null()
- return frombytes(self.field.name())
-
- property metadata:
-
- def __get__(self):
- self._check_null()
- return box_metadata(self.field.metadata().get())
-
- def _check_null(self):
- if self.field == NULL:
- raise ReferenceError(
- 'Field not initialized (references NULL pointer)')
-
- def add_metadata(self, dict metadata):
- """
- Add metadata as dict of string keys and values to Field
-
- Parameters
- ----------
- metadata : dict
- Keys and values must be string-like / coercible to bytes
-
- Returns
- -------
- field : pyarrow.Field
- """
- cdef shared_ptr[CKeyValueMetadata] c_meta
- convert_metadata(metadata, &c_meta)
-
- cdef shared_ptr[CField] new_field
- with nogil:
- check_status(self.field.AddMetadata(c_meta, &new_field))
-
- return pyarrow_wrap_field(new_field)
-
- def remove_metadata(self):
- """
- Create new field without metadata, if any
-
- Returns
- -------
- field : pyarrow.Field
- """
- cdef shared_ptr[CField] new_field
- with nogil:
- new_field = self.field.RemoveMetadata()
- return pyarrow_wrap_field(new_field)
-
-
-cdef class Schema:
-
- def __cinit__(self):
- pass
-
- def __len__(self):
- return self.schema.num_fields()
-
- def __getitem__(self, int64_t i):
-
- cdef:
- Field result = Field()
- int64_t num_fields = self.schema.num_fields()
- int64_t index
-
- if not -num_fields <= i < num_fields:
- raise IndexError(
- 'Schema field index {:d} is out of range'.format(i)
- )
-
- index = i if i >= 0 else num_fields + i
- assert index >= 0
-
- result.init(self.schema.field(index))
- result.type = pyarrow_wrap_data_type(result.field.type())
-
- return result
-
- cdef void init(self, const vector[shared_ptr[CField]]& fields):
- self.schema = new CSchema(fields)
- self.sp_schema.reset(self.schema)
-
- cdef void init_schema(self, const shared_ptr[CSchema]& schema):
- self.schema = schema.get()
- self.sp_schema = schema
-
- property names:
-
- def __get__(self):
- cdef int i
- result = []
- for i in range(self.schema.num_fields()):
- name = frombytes(self.schema.field(i).get().name())
- result.append(name)
- return result
-
- property metadata:
-
- def __get__(self):
- return box_metadata(self.schema.metadata().get())
-
- def equals(self, other):
- """
- Test if this schema is equal to the other
- """
- cdef Schema _other
- _other = other
-
- return self.sp_schema.get().Equals(deref(_other.schema))
-
- def field_by_name(self, name):
- """
- Access a field by its name rather than the column index.
-
- Parameters
- ----------
- name: str
-
- Returns
- -------
- field: pyarrow.Field
- """
- return pyarrow_wrap_field(self.schema.GetFieldByName(tobytes(name)))
-
- def get_field_index(self, name):
- return self.schema.GetFieldIndex(tobytes(name))
-
- def add_metadata(self, dict metadata):
- """
- Add metadata as dict of string keys and values to Schema
-
- Parameters
- ----------
- metadata : dict
- Keys and values must be string-like / coercible to bytes
-
- Returns
- -------
- schema : pyarrow.Schema
- """
- cdef shared_ptr[CKeyValueMetadata] c_meta
- convert_metadata(metadata, &c_meta)
-
- cdef shared_ptr[CSchema] new_schema
- with nogil:
- check_status(self.schema.AddMetadata(c_meta, &new_schema))
-
- return pyarrow_wrap_schema(new_schema)
-
- def remove_metadata(self):
- """
- Create new schema without metadata, if any
-
- Returns
- -------
- schema : pyarrow.Schema
- """
- cdef shared_ptr[CSchema] new_schema
- with nogil:
- new_schema = self.schema.RemoveMetadata()
- return pyarrow_wrap_schema(new_schema)
-
- def __str__(self):
- return frombytes(self.schema.ToString())
-
- def __repr__(self):
- return self.__str__()
-
-
-cdef dict box_metadata(const CKeyValueMetadata* metadata):
- cdef unordered_map[c_string, c_string] result
- if metadata != nullptr:
- metadata.ToUnorderedMap(&result)
- return result
- else:
- return None
-
-
-cdef dict _type_cache = {}
-
-
-cdef DataType primitive_type(Type type):
- if type in _type_cache:
- return _type_cache[type]
-
- cdef DataType out = DataType()
- out.init(GetPrimitiveType(type))
-
- _type_cache[type] = out
- return out
-
-#------------------------------------------------------------
-# Type factory functions
-
-cdef int convert_metadata(dict metadata,
- shared_ptr[CKeyValueMetadata]* out) except -1:
- cdef:
- shared_ptr[CKeyValueMetadata] meta = (
- make_shared[CKeyValueMetadata]())
- c_string key, value
-
- for py_key, py_value in metadata.items():
- key = tobytes(py_key)
- value = tobytes(py_value)
- meta.get().Append(key, value)
- out[0] = meta
- return 0
-
-
-def field(name, DataType type, bint nullable=True, dict metadata=None):
- """
- Create a pyarrow.Field instance
-
- Parameters
- ----------
- name : string or bytes
- type : pyarrow.DataType
- nullable : boolean, default True
- metadata : dict, default None
- Keys and values must be coercible to bytes
-
- Returns
- -------
- field : pyarrow.Field
- """
- cdef:
- shared_ptr[CKeyValueMetadata] c_meta
- Field result = Field()
-
- if metadata is not None:
- convert_metadata(metadata, &c_meta)
-
- result.sp_field.reset(new CField(tobytes(name), type.sp_type,
- nullable, c_meta))
- result.field = result.sp_field.get()
- result.type = type
- return result
-
-
-cdef set PRIMITIVE_TYPES = set([
- _Type_NA, _Type_BOOL,
- _Type_UINT8, _Type_INT8,
- _Type_UINT16, _Type_INT16,
- _Type_UINT32, _Type_INT32,
- _Type_UINT64, _Type_INT64,
- _Type_TIMESTAMP, _Type_DATE32,
- _Type_TIME32, _Type_TIME64,
- _Type_DATE64,
- _Type_HALF_FLOAT,
- _Type_FLOAT,
- _Type_DOUBLE])
-
-
-def null():
- return primitive_type(_Type_NA)
-
-
-def bool_():
- return primitive_type(_Type_BOOL)
-
-
-def uint8():
- return primitive_type(_Type_UINT8)
-
-
-def int8():
- return primitive_type(_Type_INT8)
-
-
-def uint16():
- return primitive_type(_Type_UINT16)
-
-
-def int16():
- return primitive_type(_Type_INT16)
-
-
-def uint32():
- return primitive_type(_Type_UINT32)
-
-
-def int32():
- return primitive_type(_Type_INT32)
-
-
-def uint64():
- return primitive_type(_Type_UINT64)
-
-
-def int64():
- return primitive_type(_Type_INT64)
-
-
-cdef dict _timestamp_type_cache = {}
-cdef dict _time_type_cache = {}
-
-
-cdef timeunit_to_string(TimeUnit unit):
- if unit == TimeUnit_SECOND:
- return 's'
- elif unit == TimeUnit_MILLI:
- return 'ms'
- elif unit == TimeUnit_MICRO:
- return 'us'
- elif unit == TimeUnit_NANO:
- return 'ns'
-
-
-def timestamp(unit_str, tz=None):
- cdef:
- TimeUnit unit
- c_string c_timezone
-
- if unit_str == "s":
- unit = TimeUnit_SECOND
- elif unit_str == 'ms':
- unit = TimeUnit_MILLI
- elif unit_str == 'us':
- unit = TimeUnit_MICRO
- elif unit_str == 'ns':
- unit = TimeUnit_NANO
- else:
- raise ValueError('Invalid TimeUnit string')
-
- cdef TimestampType out = TimestampType()
-
- if tz is None:
- out.init(ctimestamp(unit))
- if unit in _timestamp_type_cache:
- return _timestamp_type_cache[unit]
- _timestamp_type_cache[unit] = out
- else:
- if not isinstance(tz, six.string_types):
- tz = tz.zone
-
- c_timezone = tobytes(tz)
- out.init(ctimestamp(unit, c_timezone))
-
- return out
-
-
-def time32(unit_str):
- cdef:
- TimeUnit unit
- c_string c_timezone
-
- if unit_str == "s":
- unit = TimeUnit_SECOND
- elif unit_str == 'ms':
- unit = TimeUnit_MILLI
- else:
- raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str))
-
- cdef Time32Type out
- if unit in _time_type_cache:
- return _time_type_cache[unit]
- else:
- out = Time32Type()
- out.init(ctime32(unit))
- _time_type_cache[unit] = out
- return out
-
-
-def time64(unit_str):
- cdef:
- TimeUnit unit
- c_string c_timezone
-
- if unit_str == "us":
- unit = TimeUnit_MICRO
- elif unit_str == 'ns':
- unit = TimeUnit_NANO
- else:
- raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str))
-
- cdef Time64Type out
- if unit in _time_type_cache:
- return _time_type_cache[unit]
- else:
- out = Time64Type()
- out.init(ctime64(unit))
- _time_type_cache[unit] = out
- return out
-
-
-def date32():
- return primitive_type(_Type_DATE32)
-
-
-def date64():
- return primitive_type(_Type_DATE64)
-
-
-def float16():
- return primitive_type(_Type_HALF_FLOAT)
-
-
-def float32():
- return primitive_type(_Type_FLOAT)
-
-
-def float64():
- return primitive_type(_Type_DOUBLE)
-
-
-cpdef DataType decimal(int precision, int scale=0):
- cdef shared_ptr[CDataType] decimal_type
- decimal_type.reset(new CDecimalType(precision, scale))
- return pyarrow_wrap_data_type(decimal_type)
-
-
-def string():
- """
- UTF8 string
- """
- return primitive_type(_Type_STRING)
-
-
-def binary(int length=-1):
- """Binary (PyBytes-like) type
-
- Parameters
- ----------
- length : int, optional, default -1
- If length == -1 then return a variable length binary type. If length is
- greater than or equal to 0 then return a fixed size binary type of
- width `length`.
- """
- if length == -1:
- return primitive_type(_Type_BINARY)
-
- cdef shared_ptr[CDataType] fixed_size_binary_type
- fixed_size_binary_type.reset(new CFixedSizeBinaryType(length))
- return pyarrow_wrap_data_type(fixed_size_binary_type)
-
-
-cpdef ListType list_(value_type):
- """
- Create ListType instance from child data type or field
-
- Parameters
- ----------
- value_type : DataType or Field
-
- Returns
- -------
- list_type : DataType
- """
- cdef:
- DataType data_type
- Field field
- shared_ptr[CDataType] list_type
- ListType out = ListType()
-
- if isinstance(value_type, DataType):
- list_type.reset(new CListType((<DataType> value_type).sp_type))
- elif isinstance(value_type, Field):
- list_type.reset(new CListType((<Field> value_type).sp_field))
- else:
- raise ValueError('List requires DataType or Field')
-
- out.init(list_type)
- return out
-
-
-cpdef DictionaryType dictionary(DataType index_type, Array dictionary):
- """
- Dictionary (categorical, or simply encoded) type
-
- Parameters
- ----------
- index_type : DataType
- dictionary : Array
-
- Returns
- -------
- type : DictionaryType
- """
- cdef DictionaryType out = DictionaryType()
- cdef shared_ptr[CDataType] dict_type
- dict_type.reset(new CDictionaryType(index_type.sp_type,
- dictionary.sp_array))
- out.init(dict_type)
- return out
-
-
-def struct(fields):
- """
- Create StructType instance from fields
-
- Parameters
- ----------
- fields : sequence of Field values
-
- Examples
- --------
- import pyarrow as pa
- fields = [
- pa.field('f1', pa.int32()),
- pa.field('f2', pa.string())
- ]
- struct_type = pa.struct(fields)
-
- Returns
- -------
- type : DataType
- """
- cdef:
- Field field
- vector[shared_ptr[CField]] c_fields
- cdef shared_ptr[CDataType] struct_type
-
- for field in fields:
- c_fields.push_back(field.sp_field)
-
- struct_type.reset(new CStructType(c_fields))
- return pyarrow_wrap_data_type(struct_type)
-
-
-def schema(fields):
- """
- Construct pyarrow.Schema from collection of fields
-
- Parameters
- ----------
- field : list or iterable
-
- Returns
- -------
- schema : pyarrow.Schema
- """
- cdef:
- Schema result
- Field field
- vector[shared_ptr[CField]] c_fields
-
- for i, field in enumerate(fields):
- c_fields.push_back(field.sp_field)
-
- result = Schema()
- result.init(c_fields)
- return result
-
-
-def from_numpy_dtype(object dtype):
- """
- Convert NumPy dtype to pyarrow.DataType
- """
- cdef shared_ptr[CDataType] c_type
- with nogil:
- check_status(NumPyDtypeToArrow(dtype, &c_type))
-
- return pyarrow_wrap_data_type(c_type)
-
-
-NA = None
-
-
-cdef class NAType(Scalar):
-
- def __cinit__(self):
- global NA
- if NA is not None:
- raise Exception('Cannot create multiple NAType instances')
-
- self.type = null()
-
- def __repr__(self):
- return 'NA'
-
- def as_py(self):
- return None
-
-
-NA = NAType()
-
-
-cdef class ArrayValue(Scalar):
-
- cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
- int64_t index):
- self.type = type
- self.index = index
- self._set_array(sp_array)
-
- cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
- self.sp_array = sp_array
-
- def _check_null(self):
- if self.sp_array.get() == NULL:
- raise ReferenceError(
- 'ArrayValue instance not propertly initialized '
- '(references NULL pointer)')
-
- def __repr__(self):
- self._check_null()
- if hasattr(self, 'as_py'):
- return repr(self.as_py())
- else:
- return super(Scalar, self).__repr__()
-
-
-cdef class BooleanValue(ArrayValue):
-
- def as_py(self):
- cdef CBooleanArray* ap = <CBooleanArray*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int8Value(ArrayValue):
-
- def as_py(self):
- cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt8Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int16Value(ArrayValue):
-
- def as_py(self):
- cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt16Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int32Value(ArrayValue):
-
- def as_py(self):
- cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt32Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Int64Value(ArrayValue):
-
- def as_py(self):
- cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class UInt64Value(ArrayValue):
-
- def as_py(self):
- cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class Date32Value(ArrayValue):
-
- def as_py(self):
- cdef CDate32Array* ap = <CDate32Array*> self.sp_array.get()
-
- # Shift to seconds since epoch
- return datetime.datetime.utcfromtimestamp(
- int(ap.Value(self.index)) * 86400).date()
-
-
-cdef class Date64Value(ArrayValue):
-
- def as_py(self):
- cdef CDate64Array* ap = <CDate64Array*> self.sp_array.get()
- return datetime.datetime.utcfromtimestamp(
- ap.Value(self.index) / 1000).date()
-
-
-cdef class Time32Value(ArrayValue):
-
- def as_py(self):
- cdef:
- CTime32Array* ap = <CTime32Array*> self.sp_array.get()
- CTime32Type* dtype = <CTime32Type*> ap.type().get()
-
- if dtype.unit() == TimeUnit_SECOND:
- return (datetime.datetime(1970, 1, 1) + datetime.timedelta(seconds=ap.Value(self.index))).time()
- else:
- return (datetime.datetime(1970, 1, 1) + datetime.timedelta(milliseconds=ap.Value(self.index))).time()
-
-
-cdef class Time64Value(ArrayValue):
-
- def as_py(self):
- cdef:
- CTime64Array* ap = <CTime64Array*> self.sp_array.get()
- CTime64Type* dtype = <CTime64Type*> ap.type().get()
-
- if dtype.unit() == TimeUnit_MICRO:
- return (datetime.datetime(1970, 1, 1) + datetime.timedelta(microseconds=ap.Value(self.index))).time()
- else:
- return (datetime.datetime(1970, 1, 1) + datetime.timedelta(microseconds=ap.Value(self.index) / 1000)).time()
-
-
-cdef dict DATETIME_CONVERSION_FUNCTIONS
-
-try:
- import pandas as pd
-except ImportError:
- DATETIME_CONVERSION_FUNCTIONS = {
- TimeUnit_SECOND: lambda x, tzinfo: (
- datetime.datetime.utcfromtimestamp(x).replace(tzinfo=tzinfo)
- ),
- TimeUnit_MILLI: lambda x, tzinfo: (
- datetime.datetime.utcfromtimestamp(x / 1e3).replace(tzinfo=tzinfo)
- ),
- TimeUnit_MICRO: lambda x, tzinfo: (
- datetime.datetime.utcfromtimestamp(x / 1e6).replace(tzinfo=tzinfo)
- ),
- }
-else:
- DATETIME_CONVERSION_FUNCTIONS = {
- TimeUnit_SECOND: lambda x, tzinfo: pd.Timestamp(
- x * 1000000000, tz=tzinfo, unit='ns',
- ),
- TimeUnit_MILLI: lambda x, tzinfo: pd.Timestamp(
- x * 1000000, tz=tzinfo, unit='ns',
- ),
- TimeUnit_MICRO: lambda x, tzinfo: pd.Timestamp(
- x * 1000, tz=tzinfo, unit='ns',
- ),
- TimeUnit_NANO: lambda x, tzinfo: pd.Timestamp(
- x, tz=tzinfo, unit='ns',
- )
- }
-
-
-cdef class TimestampValue(ArrayValue):
-
- def as_py(self):
- cdef:
- CTimestampArray* ap = <CTimestampArray*> self.sp_array.get()
- CTimestampType* dtype = <CTimestampType*> ap.type().get()
- int64_t value = ap.Value(self.index)
-
- if not dtype.timezone().empty():
- import pytz
- tzinfo = pytz.timezone(frombytes(dtype.timezone()))
- else:
- tzinfo = None
-
- try:
- converter = DATETIME_CONVERSION_FUNCTIONS[dtype.unit()]
- except KeyError:
- raise ValueError(
- 'Cannot convert nanosecond timestamps without pandas'
- )
- return converter(value, tzinfo=tzinfo)
-
-
-cdef class FloatValue(ArrayValue):
-
- def as_py(self):
- cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class DoubleValue(ArrayValue):
-
- def as_py(self):
- cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
- return ap.Value(self.index)
-
-
-cdef class DecimalValue(ArrayValue):
-
- def as_py(self):
- cdef:
- CDecimalArray* ap = <CDecimalArray*> self.sp_array.get()
- c_string s = ap.FormatValue(self.index)
- return _pydecimal.Decimal(s.decode('utf8'))
-
-
-cdef class StringValue(ArrayValue):
-
- def as_py(self):
- cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
- return ap.GetString(self.index).decode('utf-8')
-
-
-cdef class BinaryValue(ArrayValue):
-
- def as_py(self):
- cdef:
- const uint8_t* ptr
- int32_t length
- CBinaryArray* ap = <CBinaryArray*> self.sp_array.get()
-
- ptr = ap.GetValue(self.index, &length)
- return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
-
-
-cdef class ListValue(ArrayValue):
-
- def __len__(self):
- return self.ap.value_length(self.index)
-
- def __getitem__(self, i):
- return self.getitem(i)
-
- def __iter__(self):
- for i in range(len(self)):
- yield self.getitem(i)
- raise StopIteration
-
- cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
- self.sp_array = sp_array
- self.ap = <CListArray*> sp_array.get()
- self.value_type = pyarrow_wrap_data_type(self.ap.value_type())
-
- cdef getitem(self, int64_t i):
- cdef int64_t j = self.ap.value_offset(self.index) + i
- return box_scalar(self.value_type, self.ap.values(), j)
-
- def as_py(self):
- cdef:
- int64_t j
- list result = []
-
- for j in range(len(self)):
- result.append(self.getitem(j).as_py())
-
- return result
-
-
-cdef class FixedSizeBinaryValue(ArrayValue):
-
- def as_py(self):
- cdef:
- CFixedSizeBinaryArray* ap
- CFixedSizeBinaryType* ap_type
- int32_t length
- const char* data
- ap = <CFixedSizeBinaryArray*> self.sp_array.get()
- ap_type = <CFixedSizeBinaryType*> ap.type().get()
- length = ap_type.byte_width()
- data = <const char*> ap.GetValue(self.index)
- return cp.PyBytes_FromStringAndSize(data, length)
-
-
-cdef class StructValue(ArrayValue):
- def as_py(self):
- cdef:
- CStructArray* ap
- vector[shared_ptr[CField]] child_fields = self.type.type.children()
- ap = <CStructArray*> self.sp_array.get()
- wrapped_arrays = (pyarrow_wrap_array(ap.field(i))
- for i in range(ap.num_fields()))
- child_names = (child.get().name() for child in child_fields)
- # Return the struct as a dict
- return {
- frombytes(name): child_array[self.index].as_py()
- for name, child_array in
- zip(child_names, wrapped_arrays)
- }
-
-cdef dict _scalar_classes = {
- _Type_BOOL: BooleanValue,
- _Type_UINT8: Int8Value,
- _Type_UINT16: Int16Value,
- _Type_UINT32: Int32Value,
- _Type_UINT64: Int64Value,
- _Type_INT8: Int8Value,
- _Type_INT16: Int16Value,
- _Type_INT32: Int32Value,
- _Type_INT64: Int64Value,
- _Type_DATE32: Date32Value,
- _Type_DATE64: Date64Value,
- _Type_TIME32: Time32Value,
- _Type_TIME64: Time64Value,
- _Type_TIMESTAMP: TimestampValue,
- _Type_FLOAT: FloatValue,
- _Type_DOUBLE: DoubleValue,
- _Type_LIST: ListValue,
- _Type_BINARY: BinaryValue,
- _Type_STRING: StringValue,
- _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
- _Type_DECIMAL: DecimalValue,
- _Type_STRUCT: StructValue,
-}
-
-cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
- int64_t index):
- cdef ArrayValue val
- if type.type.id() == _Type_NA:
- return NA
- elif sp_array.get().IsNull(index):
- return NA
- else:
- val = _scalar_classes[type.type.id()]()
- val.init(type, sp_array, index)
- return val
-
cdef maybe_coerce_datetime64(values, dtype, DataType type,
timestamps_to_ms=False):
@@ -1229,7 +113,7 @@ cdef class Array:
series : pandas.Series or numpy.ndarray
mask : pandas.Series or numpy.ndarray, optional
- boolean mask if the object is valid or null
+ boolean mask if the object is null (True) or valid (False)
type : pyarrow.DataType
Explicit type to attempt to coerce to
@@ -1690,6 +574,7 @@ cdef class DictionaryArray(Array):
result.init(c_result)
return result
+
cdef class StructArray(Array):
@staticmethod
def from_arrays(field_names, arrays):
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/python/pyarrow/lib.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index cf8e4df..5990308 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -100,6 +100,12 @@ include "error.pxi"
# Memory pools and allocation
include "memory.pxi"
+# DataType, Field, Schema
+include "types.pxi"
+
+# Array scalar values
+include "scalar.pxi"
+
# Array types
include "array.pxi"
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/python/pyarrow/pandas_compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index c909b3e..9b2a5c4 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -62,6 +62,8 @@ def get_logical_type_map():
pa.lib.Type_DOUBLE: 'float64',
pa.lib.Type_DATE32: 'date',
pa.lib.Type_DATE64: 'date',
+ pa.lib.Type_TIME32: 'time',
+ pa.lib.Type_TIME64: 'time',
pa.lib.Type_BINARY: 'bytes',
pa.lib.Type_FIXED_SIZE_BINARY: 'bytes',
pa.lib.Type_STRING: 'unicode',
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/python/pyarrow/scalar.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
new file mode 100644
index 0000000..11ed0ef
--- /dev/null
+++ b/python/pyarrow/scalar.pxi
@@ -0,0 +1,376 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# Module-level placeholder; rebound to the single NAType instance right after
+# the class definition below.
+NA = None
+
+
+cdef class NAType(Scalar):
+    """
+    Type of the ``NA`` singleton that stands in for null values.
+
+    Only one instance may exist: construction fails once the module-level
+    ``NA`` name has been bound to an instance.
+    """
+
+    def __cinit__(self):
+        global NA
+        # NA is still None only while the singleton itself is being built
+        if NA is not None:
+            raise Exception('Cannot create multiple NAType instances')
+
+        self.type = null()
+
+    def __repr__(self):
+        return 'NA'
+
+    def as_py(self):
+        """Return None, the Python counterpart of NA."""
+        return None
+
+
+NA = NAType()
+
+
+cdef class ArrayValue(Scalar):
+    """
+    Base class for scalars that refer to one slot of an Arrow array.
+
+    An instance keeps the array alive through ``sp_array`` and remembers the
+    slot position in ``index``; subclasses add ``as_py`` to convert the slot
+    to a Python object.
+    """
+
+    cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
+                   int64_t index):
+        # Record type and position first, then let _set_array (overridable
+        # by subclasses) store the array reference.
+        self.type = type
+        self.index = index
+        self._set_array(sp_array)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+
+    def _check_null(self):
+        """Raise ReferenceError if no array has been attached yet."""
+        if self.sp_array.get() == NULL:
+            raise ReferenceError(
+                'ArrayValue instance not properly initialized '
+                '(references NULL pointer)')
+
+    def __repr__(self):
+        self._check_null()
+        # Prefer the repr of the converted Python value when the subclass
+        # provides as_py; otherwise use the __repr__ found after Scalar in
+        # the method resolution order.
+        if hasattr(self, 'as_py'):
+            return repr(self.as_py())
+        else:
+            return super(Scalar, self).__repr__()
+
+
+cdef class BooleanValue(ArrayValue):
+    """Scalar referring to one slot of a CBooleanArray."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CBooleanArray* arr = <CBooleanArray*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class Int8Value(ArrayValue):
+    """Scalar referring to one slot of a CInt8Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CInt8Array* arr = <CInt8Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class UInt8Value(ArrayValue):
+    """Scalar referring to one slot of a CUInt8Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CUInt8Array* arr = <CUInt8Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class Int16Value(ArrayValue):
+    """Scalar referring to one slot of a CInt16Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CInt16Array* arr = <CInt16Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class UInt16Value(ArrayValue):
+    """Scalar referring to one slot of a CUInt16Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CUInt16Array* arr = <CUInt16Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class Int32Value(ArrayValue):
+    """Scalar referring to one slot of a CInt32Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CInt32Array* arr = <CInt32Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class UInt32Value(ArrayValue):
+    """Scalar referring to one slot of a CUInt32Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CUInt32Array* arr = <CUInt32Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class Int64Value(ArrayValue):
+    """Scalar referring to one slot of a CInt64Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CInt64Array* arr = <CInt64Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class UInt64Value(ArrayValue):
+    """Scalar referring to one slot of a CUInt64Array."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CUInt64Array* arr = <CUInt64Array*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class Date32Value(ArrayValue):
+    """Scalar referring to one slot of a CDate32Array."""
+
+    def as_py(self):
+        """Return the stored value as a ``datetime.date``."""
+        cdef:
+            CDate32Array* arr = <CDate32Array*> self.sp_array.get()
+
+        # The stored count is scaled by 86400 (seconds per day) to obtain
+        # seconds since the epoch before building the datetime.
+        seconds = int(arr.Value(self.index)) * 86400
+        return datetime.datetime.utcfromtimestamp(seconds).date()
+
+
+cdef class Date64Value(ArrayValue):
+    """Scalar referring to one slot of a CDate64Array."""
+
+    def as_py(self):
+        """Return the stored value as a ``datetime.date``."""
+        cdef:
+            CDate64Array* arr = <CDate64Array*> self.sp_array.get()
+
+        # The stored count is divided by 1000 before being interpreted as
+        # seconds since the epoch.
+        moment = datetime.datetime.utcfromtimestamp(
+            arr.Value(self.index) / 1000)
+        return moment.date()
+
+
+cdef class Time32Value(ArrayValue):
+    """Scalar referring to one slot of a CTime32Array."""
+
+    def as_py(self):
+        """
+        Return the stored value as a ``datetime.time``.
+
+        The count is read as seconds when the type's unit is
+        TimeUnit_SECOND and as milliseconds otherwise.
+        """
+        cdef:
+            CTime32Array* arr = <CTime32Array*> self.sp_array.get()
+            CTime32Type* time_type = <CTime32Type*> arr.type().get()
+
+        # Add the offset to an arbitrary midnight and keep only the
+        # time-of-day part.
+        midnight = datetime.datetime(1970, 1, 1)
+        if time_type.unit() == TimeUnit_SECOND:
+            offset = datetime.timedelta(seconds=arr.Value(self.index))
+        else:
+            offset = datetime.timedelta(milliseconds=arr.Value(self.index))
+        return (midnight + offset).time()
+
+
+cdef class Time64Value(ArrayValue):
+    """Scalar referring to one slot of a CTime64Array."""
+
+    def as_py(self):
+        """
+        Return the stored value as a ``datetime.time``.
+
+        The count is read as microseconds when the type's unit is
+        TimeUnit_MICRO; otherwise it is divided by 1000 first, because
+        ``datetime.timedelta`` has no finer unit than microseconds.
+        """
+        cdef:
+            CTime64Array* ap = <CTime64Array*> self.sp_array.get()
+            CTime64Type* dtype = <CTime64Type*> ap.type().get()
+
+        # NOTE: a leftover debugging ``print(val)`` used to sit here and
+        # wrote to stdout on every conversion; it has been removed.
+        cdef int64_t val = ap.Value(self.index)
+        if dtype.unit() == TimeUnit_MICRO:
+            return (datetime.datetime(1970, 1, 1) +
+                    datetime.timedelta(microseconds=val)).time()
+        else:
+            return (datetime.datetime(1970, 1, 1) +
+                    datetime.timedelta(microseconds=val / 1000)).time()
+
+
+# Maps a timestamp TimeUnit to a callable ``(count, tzinfo)`` that builds a
+# Python-level timestamp object. Which table is installed depends on whether
+# pandas can be imported.
+cdef dict DATETIME_CONVERSION_FUNCTIONS
+
+try:
+    import pandas as pd
+except ImportError:
+    # Without pandas: plain datetime objects. There is no TimeUnit_NANO
+    # entry in this table.
+    DATETIME_CONVERSION_FUNCTIONS = {
+        TimeUnit_SECOND: lambda ts, tzinfo: (
+            datetime.datetime.utcfromtimestamp(ts).replace(tzinfo=tzinfo)
+        ),
+        TimeUnit_MILLI: lambda ts, tzinfo: (
+            datetime.datetime.utcfromtimestamp(ts / 1e3).replace(tzinfo=tzinfo)
+        ),
+        TimeUnit_MICRO: lambda ts, tzinfo: (
+            datetime.datetime.utcfromtimestamp(ts / 1e6).replace(tzinfo=tzinfo)
+        ),
+    }
+else:
+    # With pandas: every unit is rescaled to nanoseconds and handed to
+    # pd.Timestamp.
+    DATETIME_CONVERSION_FUNCTIONS = {
+        TimeUnit_SECOND: lambda ts, tzinfo: pd.Timestamp(
+            ts * 1000000000, tz=tzinfo, unit='ns',
+        ),
+        TimeUnit_MILLI: lambda ts, tzinfo: pd.Timestamp(
+            ts * 1000000, tz=tzinfo, unit='ns',
+        ),
+        TimeUnit_MICRO: lambda ts, tzinfo: pd.Timestamp(
+            ts * 1000, tz=tzinfo, unit='ns',
+        ),
+        TimeUnit_NANO: lambda ts, tzinfo: pd.Timestamp(
+            ts, tz=tzinfo, unit='ns',
+        )
+    }
+
+
+cdef class TimestampValue(ArrayValue):
+    """Scalar referring to one slot of a CTimestampArray."""
+
+    def as_py(self):
+        """
+        Convert the stored count with the converter registered for the
+        type's unit in DATETIME_CONVERSION_FUNCTIONS.
+
+        Raises
+        ------
+        ValueError
+            If no converter is registered for the unit.
+        """
+        cdef:
+            CTimestampArray* arr = <CTimestampArray*> self.sp_array.get()
+            CTimestampType* ts_type = <CTimestampType*> arr.type().get()
+            int64_t raw = arr.Value(self.index)
+
+        # pytz is only imported when the type actually carries a timezone
+        tzinfo = None
+        if not ts_type.timezone().empty():
+            import pytz
+            tzinfo = pytz.timezone(frombytes(ts_type.timezone()))
+
+        try:
+            convert = DATETIME_CONVERSION_FUNCTIONS[ts_type.unit()]
+        except KeyError:
+            raise ValueError(
+                'Cannot convert nanosecond timestamps without pandas'
+            )
+        return convert(raw, tzinfo=tzinfo)
+
+
+cdef class FloatValue(ArrayValue):
+    """Scalar referring to one slot of a CFloatArray."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CFloatArray* arr = <CFloatArray*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class DoubleValue(ArrayValue):
+    """Scalar referring to one slot of a CDoubleArray."""
+
+    def as_py(self):
+        """Return the value stored at ``self.index`` as a Python object."""
+        cdef:
+            CDoubleArray* arr = <CDoubleArray*> self.sp_array.get()
+        return arr.Value(self.index)
+
+
+cdef class DecimalValue(ArrayValue):
+    """Scalar referring to one slot of a CDecimalArray."""
+
+    def as_py(self):
+        """Return the stored value as a ``decimal.Decimal``-style object."""
+        cdef:
+            CDecimalArray* arr = <CDecimalArray*> self.sp_array.get()
+            c_string formatted = arr.FormatValue(self.index)
+        # The array formats the value as text; that text is parsed by the
+        # Decimal constructor.
+        text = formatted.decode('utf8')
+        return _pydecimal.Decimal(text)
+
+
+cdef class StringValue(ArrayValue):
+    """Scalar referring to one slot of a CStringArray."""
+
+    def as_py(self):
+        """Return the stored bytes decoded as UTF-8 text."""
+        cdef:
+            CStringArray* arr = <CStringArray*> self.sp_array.get()
+        return arr.GetString(self.index).decode('utf-8')
+
+
+cdef class BinaryValue(ArrayValue):
+    """Scalar referring to one slot of a CBinaryArray."""
+
+    def as_py(self):
+        """Return the stored value as a Python bytes object."""
+        cdef:
+            CBinaryArray* arr = <CBinaryArray*> self.sp_array.get()
+            int32_t nbytes
+            const uint8_t* data
+
+        # GetValue returns the data pointer and writes the length through
+        # the second argument.
+        data = arr.GetValue(self.index, &nbytes)
+        return cp.PyBytes_FromStringAndSize(<const char*>(data), nbytes)
+
+
+cdef class ListValue(ArrayValue):
+    """
+    Scalar referring to one slot of a CListArray.
+
+    The slot is itself a sequence: its length, indexing and iteration all
+    work on the child values belonging to that slot, each returned as a
+    boxed scalar.
+    """
+
+    def __len__(self):
+        # Number of child values in this list slot
+        return self.ap.value_length(self.index)
+
+    def __getitem__(self, i):
+        return self.getitem(i)
+
+    def __iter__(self):
+        # A generator finishes by returning. The explicit
+        # ``raise StopIteration`` that used to follow this loop is converted
+        # to RuntimeError under PEP 479 (the default since Python 3.7), which
+        # made ``list(value)`` and plain for-loops fail after the last item.
+        for i in range(len(self)):
+            yield self.getitem(i)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        # Besides keeping the array alive, cache the typed pointer and the
+        # wrapped child type used by the accessors above.
+        self.sp_array = sp_array
+        self.ap = <CListArray*> sp_array.get()
+        self.value_type = pyarrow_wrap_data_type(self.ap.value_type())
+
+    cdef getitem(self, int64_t i):
+        # Translate a position inside this slot to a position in the child
+        # values array. No bounds check is performed here.
+        cdef int64_t j = self.ap.value_offset(self.index) + i
+        return box_scalar(self.value_type, self.ap.values(), j)
+
+    def as_py(self):
+        """Return a Python list with ``as_py()`` applied to every child."""
+        cdef:
+            int64_t j
+            list result = []
+
+        for j in range(len(self)):
+            result.append(self.getitem(j).as_py())
+
+        return result
+
+
+cdef class FixedSizeBinaryValue(ArrayValue):
+    """Scalar referring to one slot of a CFixedSizeBinaryArray."""
+
+    def as_py(self):
+        """Return the stored value as a Python bytes object."""
+        cdef:
+            CFixedSizeBinaryArray* arr = (
+                <CFixedSizeBinaryArray*> self.sp_array.get())
+            CFixedSizeBinaryType* arr_type = (
+                <CFixedSizeBinaryType*> arr.type().get())
+            # Every slot has the width declared on the type
+            int32_t width = arr_type.byte_width()
+            const char* raw = <const char*> arr.GetValue(self.index)
+        return cp.PyBytes_FromStringAndSize(raw, width)
+
+
+cdef class StructValue(ArrayValue):
+    """Scalar referring to one slot of a CStructArray."""
+
+    def as_py(self):
+        """
+        Return the struct as a dict mapping each child field name to the
+        Python value of that child at ``self.index``.
+        """
+        cdef:
+            CStructArray* ap
+            vector[shared_ptr[CField]] child_fields = self.type.type.children()
+        ap = <CStructArray*> self.sp_array.get()
+        # Field names and wrapped child arrays are produced lazily and
+        # paired up positionally.
+        names = (child.get().name() for child in child_fields)
+        columns = (pyarrow_wrap_array(ap.field(i))
+                   for i in range(ap.num_fields()))
+        return {
+            frombytes(field_name): column[self.index].as_py()
+            for field_name, column in zip(names, columns)
+        }
+
+
+# Dispatch table used by box_scalar: Arrow type id -> ArrayValue subclass
+# wrapping one slot of an array of that type.
+#
+# Unsigned ids must map to the UInt*Value wrappers. They were previously
+# mapped to the signed Int*Value classes, whose as_py() casts the array to
+# the signed array type, so the stored bits were read through the wrong type
+# and values above the signed maximum came back wrong.
+cdef dict _scalar_classes = {
+    _Type_BOOL: BooleanValue,
+    _Type_UINT8: UInt8Value,
+    _Type_UINT16: UInt16Value,
+    _Type_UINT32: UInt32Value,
+    _Type_UINT64: UInt64Value,
+    _Type_INT8: Int8Value,
+    _Type_INT16: Int16Value,
+    _Type_INT32: Int32Value,
+    _Type_INT64: Int64Value,
+    _Type_DATE32: Date32Value,
+    _Type_DATE64: Date64Value,
+    _Type_TIME32: Time32Value,
+    _Type_TIME64: Time64Value,
+    _Type_TIMESTAMP: TimestampValue,
+    _Type_FLOAT: FloatValue,
+    _Type_DOUBLE: DoubleValue,
+    _Type_LIST: ListValue,
+    _Type_BINARY: BinaryValue,
+    _Type_STRING: StringValue,
+    _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
+    _Type_DECIMAL: DecimalValue,
+    _Type_STRUCT: StructValue,
+}
+
+cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
+                       int64_t index):
+    """
+    Wrap slot ``index`` of ``sp_array`` in the scalar class registered for
+    the type id in ``_scalar_classes``.
+
+    Returns the ``NA`` singleton when the type is the null type or the slot
+    is null. The array is only consulted when the type is not the null type.
+    """
+    cdef ArrayValue boxed
+    if type.type.id() == _Type_NA or sp_array.get().IsNull(index):
+        return NA
+
+    boxed = _scalar_classes[type.type.id()]()
+    boxed.init(type, sp_array, index)
+    return boxed
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index ed81531..413a3be 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -223,8 +223,8 @@ def test_simple_type_construction():
(pa.decimal(18, 3), 'decimal'),
(pa.timestamp('ms'), 'datetime'),
(pa.timestamp('us', 'UTC'), 'datetimetz'),
- pytest.mark.xfail((pa.time32('s'), None), raises=NotImplementedError),
- pytest.mark.xfail((pa.time64('us'), None), raises=NotImplementedError),
+ (pa.time32('s'), 'time'),
+ (pa.time64('us'), 'time')
]
)
def test_logical_type(type, expected):
http://git-wip-us.apache.org/repos/asf/arrow/blob/50b518af/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index b8b85ca..5b84817 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -646,6 +646,95 @@ class TestPandasConversion(unittest.TestCase):
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)
+    def test_pytime_from_pandas(self):
+        """from_pandas accepts datetime.time objects and raw integer ticks
+        and produces Arrow time arrays that round-trip through as_py()."""
+        times = [datetime.time(1, 2, 3, 1356),
+                 datetime.time(4, 5, 6, 1356)]
+        first, second = times
+
+        # An object ndarray of datetime.time values (plus a None) is
+        # expected to be typed as time64 with microsecond unit.
+        objects = np.array(times + [None], dtype=object)
+        inferred = pa.Array.from_pandas(objects)
+        assert inferred.type == pa.time64('us')
+        assert inferred[0].as_py() == first
+        assert inferred[1].as_py() == second
+        assert inferred[2] is pa.NA
+
+        # The same objects as a DataFrame column must yield an equal array.
+        frame = pd.DataFrame({'times': objects})
+        batch = pa.RecordBatch.from_pandas(frame)
+        assert batch[0].equals(inferred)
+
+        # Integer ndarrays are interpreted using the explicitly passed type.
+        micros = np.array([_pytime_to_micros(t) for t in times],
+                          dtype='int64')
+        cases = [
+            (micros, pa.time64('us'), first),
+            (micros * 1000, pa.time64('ns'), first),
+            # Coarser units keep only part of the 1356 microseconds.
+            ((micros / 1000).astype('i4'), pa.time32('ms'),
+             first.replace(microsecond=1000)),
+            ((micros / 1000000).astype('i4'), pa.time32('s'),
+             first.replace(microsecond=0)),
+        ]
+        for values, arrow_type, want in cases:
+            converted = pa.Array.from_pandas(values, type=arrow_type)
+            assert converted[0].as_py() == want
+
+    def test_arrow_time_to_pandas(self):
+        """to_pandas turns time32/time64 arrays into datetime.time objects,
+        with None in masked slots, for single arrays and record batches."""
+        pytimes = [datetime.time(1, 2, 3, 1356),
+                   datetime.time(4, 5, 6, 1356),
+                   datetime.time(0, 0, 0)]
+
+        # The third slot is masked out below, so None is expected there.
+        expected = np.array(pytimes[:2] + [None])
+        expected_ms = np.array([x.replace(microsecond=1000)
+                                for x in pytimes[:2]] +
+                               [None])
+        expected_s = np.array([x.replace(microsecond=0)
+                               for x in pytimes[:2]] +
+                              [None])
+
+        # Built once; kept under its own name so the converted results
+        # below do not shadow the input ticks.
+        micros = np.array([_pytime_to_micros(v) for v in pytimes],
+                          dtype='int64')
+
+        null_mask = np.array([False, False, True], dtype=bool)
+
+        a1 = pa.Array.from_pandas(micros, mask=null_mask,
+                                  type=pa.time64('us'))
+        a2 = pa.Array.from_pandas(micros * 1000, mask=null_mask,
+                                  type=pa.time64('ns'))
+
+        a3 = pa.Array.from_pandas((micros / 1000).astype('i4'),
+                                  mask=null_mask,
+                                  type=pa.time32('ms'))
+        a4 = pa.Array.from_pandas((micros / 1000000).astype('i4'),
+                                  mask=null_mask,
+                                  type=pa.time32('s'))
+
+        names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
+        batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
+
+        # Array-level conversion, one unit at a time.
+        for arrow_array, want in [(a1, expected),
+                                  (a2, expected),
+                                  (a3, expected_ms),
+                                  (a4, expected_s)]:
+            result = arrow_array.to_pandas()
+            assert (result == want).all()
+
+        # Batch-level conversion must agree column by column.
+        df = batch.to_pandas()
+        expected_df = pd.DataFrame({'time64[us]': expected,
+                                    'time64[ns]': expected,
+                                    'time32[ms]': expected_ms,
+                                    'time32[s]': expected_s},
+                                   columns=names)
+
+        tm.assert_frame_equal(df, expected_df)
+
def test_all_nones(self):
def _check_series(s):
converted = pa.Array.from_pandas(s)
@@ -782,3 +871,20 @@ class TestPandasConversion(unittest.TestCase):
assert data_column['pandas_type'] == 'decimal'
assert data_column['numpy_type'] == 'object'
assert data_column['metadata'] == {'precision': 26, 'scale': 11}
+
+
+def _pytime_from_micros(val):
+    """Build a datetime.time from a count of microseconds.
+
+    `val` is split into hours, minutes, seconds and a microsecond
+    remainder, which are passed to datetime.time in that order.
+    """
+    total_seconds, microseconds = divmod(val, 1000000)
+    total_minutes, seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(total_minutes, 60)
+    return datetime.time(hours, minutes, seconds, microseconds)
+
+
+def _pytime_to_micros(pytime):
+    """Return the hour, minute, second and microsecond fields of `pytime`
+    combined into a single count of microseconds.
+
+    Only those four attributes are read.
+    """
+    whole_seconds = (pytime.hour * 60 + pytime.minute) * 60 + pytime.second
+    return whole_seconds * 1000000 + pytime.microsecond