You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/10 01:01:00 UTC
arrow git commit: ARROW-991: [Python] Create new dtype when
deserializing from Arrow to NumPy datetime64
Repository: arrow
Updated Branches:
refs/heads/master 2d6453b25 -> 02161456c
ARROW-991: [Python] Create new dtype when deserializing from Arrow to NumPy datetime64
I am not sure how to reproduce the problem, but this will prevent us from mutating a cached dtype in the NumPy internals
Author: Wes McKinney <we...@twosigma.com>
Closes #664 from wesm/ARROW-991 and squashes the following commits:
088aa25 [Wes McKinney] Also new PyArray_NewFromDescr for blocks. Fixes pandas test suite
ee1904e [Wes McKinney] Remove unit test
2e70b08 [Wes McKinney] Always create new NPY_DATETIME dtype when converting to pandas
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/02161456
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/02161456
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/02161456
Branch: refs/heads/master
Commit: 02161456c3de5199ca0304484aff14fb7349bca4
Parents: 2d6453b
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue May 9 20:59:42 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue May 9 20:59:42 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/python/common.cc | 2 +-
cpp/src/arrow/python/common.h | 10 +--
cpp/src/arrow/python/pandas_convert.cc | 103 ++++++++++++++++------------
cpp/src/arrow/status.h | 3 +-
cpp/src/arrow/type.cc | 11 ++-
cpp/src/arrow/type.h | 6 +-
6 files changed, 70 insertions(+), 65 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/common.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/common.cc b/cpp/src/arrow/python/common.cc
index 5702c71..ba7b6cf 100644
--- a/cpp/src/arrow/python/common.cc
+++ b/cpp/src/arrow/python/common.cc
@@ -69,7 +69,7 @@ Status CheckPyError(StatusCode code) {
PyObject *exc_type, *exc_value, *traceback;
PyErr_Fetch(&exc_type, &exc_value, &traceback);
PyErr_NormalizeException(&exc_type, &exc_value, &traceback);
- PyObject *exc_value_str = PyObject_Str(exc_value);
+ PyObject* exc_value_str = PyObject_Str(exc_value);
PyObjectStringify stringified(exc_value_str);
std::string message(stringified.bytes);
Py_XDECREF(exc_type);
http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index c5745a5..f6e706b 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -34,9 +34,7 @@ namespace py {
class ARROW_EXPORT PyAcquireGIL {
public:
- PyAcquireGIL() : acquired_gil_(false) {
- acquire();
- }
+ PyAcquireGIL() : acquired_gil_(false) { acquire(); }
~PyAcquireGIL() { release(); }
@@ -113,11 +111,9 @@ struct ARROW_EXPORT PyObjectStringify {
Status CheckPyError(StatusCode code = StatusCode::UnknownError);
// TODO(wesm): We can just let errors pass through. To be explored later
-#define RETURN_IF_PYERROR() \
- RETURN_NOT_OK(CheckPyError());
+#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError());
-#define PY_RETURN_IF_ERROR(CODE) \
- RETURN_NOT_OK(CheckPyError(CODE));
+#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE));
// Return the common PyArrow memory pool
ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index b54197e..264bed1 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -977,6 +977,55 @@ Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
// ----------------------------------------------------------------------
// pandas 0.x DataFrame conversion internals
+inline void set_numpy_metadata(int type, DataType* datatype, PyArray_Descr* out) {
+ if (type == NPY_DATETIME) {
+ auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(out->c_metadata);
+ if (datatype->id() == Type::TIMESTAMP) {
+ auto timestamp_type = static_cast<TimestampType*>(datatype);
+
+ switch (timestamp_type->unit()) {
+ case TimestampType::Unit::SECOND:
+ date_dtype->meta.base = NPY_FR_s;
+ break;
+ case TimestampType::Unit::MILLI:
+ date_dtype->meta.base = NPY_FR_ms;
+ break;
+ case TimestampType::Unit::MICRO:
+ date_dtype->meta.base = NPY_FR_us;
+ break;
+ case TimestampType::Unit::NANO:
+ date_dtype->meta.base = NPY_FR_ns;
+ break;
+ }
+ } else {
+ // datatype->type == Type::DATE64
+ date_dtype->meta.base = NPY_FR_D;
+ }
+ }
+}
+
+static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
+ if (type == NPY_DATETIME) {
+ // It is not safe to mutate the result of DescrFromType
+ return PyArray_DescrNewFromType(type);
+ } else {
+ return PyArray_DescrFromType(type);
+ }
+}
+static inline PyObject* NewArray1DFromType(
+ DataType* arrow_type, int type, int64_t length, void* data) {
+ npy_intp dims[1] = {length};
+
+ PyArray_Descr* descr = GetSafeNumPyDtype(type);
+ if (descr == nullptr) {
+ // Error occurred, trust error state is set
+ return nullptr;
+ }
+
+ set_numpy_metadata(type, arrow_type, descr);
+ return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, 0, nullptr);
+}
+
class PandasBlock {
public:
enum type {
@@ -1024,13 +1073,17 @@ class PandasBlock {
Status AllocateNDArray(int npy_type, int ndim = 2) {
PyAcquireGIL lock;
+ PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+
PyObject* block_arr;
if (ndim == 2) {
npy_intp block_dims[2] = {num_columns_, num_rows_};
- block_arr = PyArray_SimpleNew(2, block_dims, npy_type);
+ block_arr = PyArray_NewFromDescr(
+ &PyArray_Type, descr, 2, block_dims, nullptr, nullptr, 0, nullptr);
} else {
npy_intp block_dims[1] = {num_rows_};
- block_arr = PyArray_SimpleNew(1, block_dims, npy_type);
+ block_arr = PyArray_NewFromDescr(
+ &PyArray_Type, descr, 1, block_dims, nullptr, nullptr, 0, nullptr);
}
if (block_arr == NULL) {
@@ -1947,34 +2000,6 @@ class DataFrameBlockCreator {
BlockMap datetimetz_blocks_;
};
-inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
- if (type == NPY_DATETIME) {
- PyArray_Descr* descr = PyArray_DESCR(out);
- auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
- if (datatype->id() == Type::TIMESTAMP) {
- auto timestamp_type = static_cast<TimestampType*>(datatype);
-
- switch (timestamp_type->unit()) {
- case TimestampType::Unit::SECOND:
- date_dtype->meta.base = NPY_FR_s;
- break;
- case TimestampType::Unit::MILLI:
- date_dtype->meta.base = NPY_FR_ms;
- break;
- case TimestampType::Unit::MICRO:
- date_dtype->meta.base = NPY_FR_us;
- break;
- case TimestampType::Unit::NANO:
- date_dtype->meta.base = NPY_FR_ns;
- break;
- }
- } else {
- // datatype->type == Type::DATE64
- date_dtype->meta.base = NPY_FR_D;
- }
- }
-}
-
class ArrowDeserializer {
public:
ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref)
@@ -1983,17 +2008,8 @@ class ArrowDeserializer {
Status AllocateOutput(int type) {
PyAcquireGIL lock;
- npy_intp dims[1] = {col_->length()};
- result_ = PyArray_SimpleNew(1, dims, type);
+ result_ = NewArray1DFromType(col_->type().get(), type, col_->length(), nullptr);
arr_ = reinterpret_cast<PyArrayObject*>(result_);
-
- if (arr_ == NULL) {
- // Error occurred, trust that SimpleNew set the error state
- return Status::OK();
- }
-
- set_numpy_metadata(type, col_->type().get(), arr_);
-
return Status::OK();
}
@@ -2010,17 +2026,14 @@ class ArrowDeserializer {
PyAcquireGIL lock;
// Zero-Copy. We can pass the data pointer directly to NumPy.
- npy_intp dims[1] = {col_->length()};
- result_ = PyArray_SimpleNewFromData(1, dims, npy_type, data);
+ result_ = NewArray1DFromType(col_->type().get(), npy_type, col_->length(), data);
arr_ = reinterpret_cast<PyArrayObject*>(result_);
if (arr_ == NULL) {
- // Error occurred, trust that SimpleNew set the error state
+ // Error occurred, trust that error set
return Status::OK();
}
- set_numpy_metadata(npy_type, col_->type().get(), arr_);
-
if (PyArray_SetBaseObject(arr_, py_ref_) == -1) {
// Error occurred, trust that SetBaseObject set the error state
return Status::OK();
http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/status.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h
index 6a8cee2..1688b96 100644
--- a/cpp/src/arrow/status.h
+++ b/cpp/src/arrow/status.h
@@ -91,8 +91,7 @@ class ARROW_EXPORT Status {
Status() : state_(NULL) {}
~Status() { delete[] state_; }
- Status(StatusCode code, const std::string& msg)
- : Status(code, msg, -1) {}
+ Status(StatusCode code, const std::string& msg) : Status(code, msg, -1) {}
// Copy the specified status.
Status(const Status& s);
http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index b1e322c..afb3027 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -42,9 +42,7 @@ std::shared_ptr<Field> Field::RemoveMetadata() const {
}
bool Field::Equals(const Field& other) const {
- if (this == &other) {
- return true;
- }
+ if (this == &other) { return true; }
if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
this->type_->Equals(*other.type_.get())) {
if (metadata_ == nullptr && other.metadata_ == nullptr) {
@@ -322,8 +320,7 @@ std::string Schema::ToString() const {
if (metadata_) {
buffer << "\n-- metadata --";
for (int64_t i = 0; i < metadata_->size(); ++i) {
- buffer << "\n" << metadata_->key(i) << ": "
- << metadata_->value(i);
+ buffer << "\n" << metadata_->key(i) << ": " << metadata_->value(i);
}
}
@@ -419,8 +416,8 @@ std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type
return std::make_shared<DictionaryType>(index_type, dict_values);
}
-std::shared_ptr<Field> field(
- const std::string& name, const std::shared_ptr<DataType>& type, bool nullable,
+std::shared_ptr<Field> field(const std::string& name,
+ const std::shared_ptr<DataType>& type, bool nullable,
const std::shared_ptr<const KeyValueMetadata>& metadata) {
return std::make_shared<Field>(name, type, nullable, metadata);
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index bb25857..40615f7 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -205,7 +205,7 @@ class ARROW_EXPORT Field {
Field(const std::string& name, const std::shared_ptr<DataType>& type,
bool nullable = true,
const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr)
- : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}
+ : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}
std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }
@@ -753,8 +753,8 @@ std::shared_ptr<DataType> ARROW_EXPORT union_(
std::shared_ptr<DataType> ARROW_EXPORT dictionary(
const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values);
-std::shared_ptr<Field> ARROW_EXPORT field(
- const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true,
+std::shared_ptr<Field> ARROW_EXPORT field(const std::string& name,
+ const std::shared_ptr<DataType>& type, bool nullable = true,
const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
// ----------------------------------------------------------------------