You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/10 01:01:00 UTC

arrow git commit: ARROW-991: [Python] Create new dtype when deserializing from Arrow to NumPy datetime64

Repository: arrow
Updated Branches:
  refs/heads/master 2d6453b25 -> 02161456c


ARROW-991: [Python] Create new dtype when deserializing from Arrow to NumPy datetime64

I am not sure how to reproduce the problem, but this will prevent us from mutating a cached dtype in the NumPy internals

Author: Wes McKinney <we...@twosigma.com>

Closes #664 from wesm/ARROW-991 and squashes the following commits:

088aa25 [Wes McKinney] Also new PyArray_NewFromDescr for blocks. Fixes pandas test suite
ee1904e [Wes McKinney] Remove unit test
2e70b08 [Wes McKinney] Always create new NPY_DATETIME dtype when converting to pandas


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/02161456
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/02161456
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/02161456

Branch: refs/heads/master
Commit: 02161456c3de5199ca0304484aff14fb7349bca4
Parents: 2d6453b
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue May 9 20:59:42 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue May 9 20:59:42 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/common.cc         |   2 +-
 cpp/src/arrow/python/common.h          |  10 +--
 cpp/src/arrow/python/pandas_convert.cc | 103 ++++++++++++++++------------
 cpp/src/arrow/status.h                 |   3 +-
 cpp/src/arrow/type.cc                  |  11 ++-
 cpp/src/arrow/type.h                   |   6 +-
 6 files changed, 70 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/common.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/common.cc b/cpp/src/arrow/python/common.cc
index 5702c71..ba7b6cf 100644
--- a/cpp/src/arrow/python/common.cc
+++ b/cpp/src/arrow/python/common.cc
@@ -69,7 +69,7 @@ Status CheckPyError(StatusCode code) {
     PyObject *exc_type, *exc_value, *traceback;
     PyErr_Fetch(&exc_type, &exc_value, &traceback);
     PyErr_NormalizeException(&exc_type, &exc_value, &traceback);
-    PyObject *exc_value_str = PyObject_Str(exc_value);
+    PyObject* exc_value_str = PyObject_Str(exc_value);
     PyObjectStringify stringified(exc_value_str);
     std::string message(stringified.bytes);
     Py_XDECREF(exc_type);

http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index c5745a5..f6e706b 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -34,9 +34,7 @@ namespace py {
 
 class ARROW_EXPORT PyAcquireGIL {
  public:
-  PyAcquireGIL() : acquired_gil_(false) {
-    acquire();
-  }
+  PyAcquireGIL() : acquired_gil_(false) { acquire(); }
 
   ~PyAcquireGIL() { release(); }
 
@@ -113,11 +111,9 @@ struct ARROW_EXPORT PyObjectStringify {
 Status CheckPyError(StatusCode code = StatusCode::UnknownError);
 
 // TODO(wesm): We can just let errors pass through. To be explored later
-#define RETURN_IF_PYERROR()                     \
-  RETURN_NOT_OK(CheckPyError());
+#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError());
 
-#define PY_RETURN_IF_ERROR(CODE)                \
-  RETURN_NOT_OK(CheckPyError(CODE));
+#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE));
 
 // Return the common PyArrow memory pool
 ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);

http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index b54197e..264bed1 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -977,6 +977,55 @@ Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
 // ----------------------------------------------------------------------
 // pandas 0.x DataFrame conversion internals
 
+inline void set_numpy_metadata(int type, DataType* datatype, PyArray_Descr* out) {
+  if (type == NPY_DATETIME) {
+    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(out->c_metadata);
+    if (datatype->id() == Type::TIMESTAMP) {
+      auto timestamp_type = static_cast<TimestampType*>(datatype);
+
+      switch (timestamp_type->unit()) {
+        case TimestampType::Unit::SECOND:
+          date_dtype->meta.base = NPY_FR_s;
+          break;
+        case TimestampType::Unit::MILLI:
+          date_dtype->meta.base = NPY_FR_ms;
+          break;
+        case TimestampType::Unit::MICRO:
+          date_dtype->meta.base = NPY_FR_us;
+          break;
+        case TimestampType::Unit::NANO:
+          date_dtype->meta.base = NPY_FR_ns;
+          break;
+      }
+    } else {
+      // datatype->type == Type::DATE64
+      date_dtype->meta.base = NPY_FR_D;
+    }
+  }
+}
+
+static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
+  if (type == NPY_DATETIME) {
+    // It is not safe to mutate the result of DescrFromType
+    return PyArray_DescrNewFromType(type);
+  } else {
+    return PyArray_DescrFromType(type);
+  }
+}
+static inline PyObject* NewArray1DFromType(
+    DataType* arrow_type, int type, int64_t length, void* data) {
+  npy_intp dims[1] = {length};
+
+  PyArray_Descr* descr = GetSafeNumPyDtype(type);
+  if (descr == nullptr) {
+    // Error occurred, trust error state is set
+    return nullptr;
+  }
+
+  set_numpy_metadata(type, arrow_type, descr);
+  return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, 0, nullptr);
+}
+
 class PandasBlock {
  public:
   enum type {
@@ -1024,13 +1073,17 @@ class PandasBlock {
   Status AllocateNDArray(int npy_type, int ndim = 2) {
     PyAcquireGIL lock;
 
+    PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+
     PyObject* block_arr;
     if (ndim == 2) {
       npy_intp block_dims[2] = {num_columns_, num_rows_};
-      block_arr = PyArray_SimpleNew(2, block_dims, npy_type);
+      block_arr = PyArray_NewFromDescr(
+          &PyArray_Type, descr, 2, block_dims, nullptr, nullptr, 0, nullptr);
     } else {
       npy_intp block_dims[1] = {num_rows_};
-      block_arr = PyArray_SimpleNew(1, block_dims, npy_type);
+      block_arr = PyArray_NewFromDescr(
+          &PyArray_Type, descr, 1, block_dims, nullptr, nullptr, 0, nullptr);
     }
 
     if (block_arr == NULL) {
@@ -1947,34 +2000,6 @@ class DataFrameBlockCreator {
   BlockMap datetimetz_blocks_;
 };
 
-inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
-  if (type == NPY_DATETIME) {
-    PyArray_Descr* descr = PyArray_DESCR(out);
-    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
-    if (datatype->id() == Type::TIMESTAMP) {
-      auto timestamp_type = static_cast<TimestampType*>(datatype);
-
-      switch (timestamp_type->unit()) {
-        case TimestampType::Unit::SECOND:
-          date_dtype->meta.base = NPY_FR_s;
-          break;
-        case TimestampType::Unit::MILLI:
-          date_dtype->meta.base = NPY_FR_ms;
-          break;
-        case TimestampType::Unit::MICRO:
-          date_dtype->meta.base = NPY_FR_us;
-          break;
-        case TimestampType::Unit::NANO:
-          date_dtype->meta.base = NPY_FR_ns;
-          break;
-      }
-    } else {
-      // datatype->type == Type::DATE64
-      date_dtype->meta.base = NPY_FR_D;
-    }
-  }
-}
-
 class ArrowDeserializer {
  public:
   ArrowDeserializer(const std::shared_ptr<Column>& col, PyObject* py_ref)
@@ -1983,17 +2008,8 @@ class ArrowDeserializer {
   Status AllocateOutput(int type) {
     PyAcquireGIL lock;
 
-    npy_intp dims[1] = {col_->length()};
-    result_ = PyArray_SimpleNew(1, dims, type);
+    result_ = NewArray1DFromType(col_->type().get(), type, col_->length(), nullptr);
     arr_ = reinterpret_cast<PyArrayObject*>(result_);
-
-    if (arr_ == NULL) {
-      // Error occurred, trust that SimpleNew set the error state
-      return Status::OK();
-    }
-
-    set_numpy_metadata(type, col_->type().get(), arr_);
-
     return Status::OK();
   }
 
@@ -2010,17 +2026,14 @@ class ArrowDeserializer {
     PyAcquireGIL lock;
 
     // Zero-Copy. We can pass the data pointer directly to NumPy.
-    npy_intp dims[1] = {col_->length()};
-    result_ = PyArray_SimpleNewFromData(1, dims, npy_type, data);
+    result_ = NewArray1DFromType(col_->type().get(), npy_type, col_->length(), data);
     arr_ = reinterpret_cast<PyArrayObject*>(result_);
 
     if (arr_ == NULL) {
-      // Error occurred, trust that SimpleNew set the error state
+      // Error occurred, trust that error set
       return Status::OK();
     }
 
-    set_numpy_metadata(npy_type, col_->type().get(), arr_);
-
     if (PyArray_SetBaseObject(arr_, py_ref_) == -1) {
       // Error occurred, trust that SetBaseObject set the error state
       return Status::OK();

http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/status.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h
index 6a8cee2..1688b96 100644
--- a/cpp/src/arrow/status.h
+++ b/cpp/src/arrow/status.h
@@ -91,8 +91,7 @@ class ARROW_EXPORT Status {
   Status() : state_(NULL) {}
   ~Status() { delete[] state_; }
 
-  Status(StatusCode code, const std::string& msg)
-    : Status(code, msg, -1) {}
+  Status(StatusCode code, const std::string& msg) : Status(code, msg, -1) {}
 
   // Copy the specified status.
   Status(const Status& s);

http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index b1e322c..afb3027 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -42,9 +42,7 @@ std::shared_ptr<Field> Field::RemoveMetadata() const {
 }
 
 bool Field::Equals(const Field& other) const {
-  if (this == &other) {
-    return true;
-  }
+  if (this == &other) { return true; }
   if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
       this->type_->Equals(*other.type_.get())) {
     if (metadata_ == nullptr && other.metadata_ == nullptr) {
@@ -322,8 +320,7 @@ std::string Schema::ToString() const {
   if (metadata_) {
     buffer << "\n-- metadata --";
     for (int64_t i = 0; i < metadata_->size(); ++i) {
-      buffer << "\n" << metadata_->key(i) << ": "
-             << metadata_->value(i);
+      buffer << "\n" << metadata_->key(i) << ": " << metadata_->value(i);
     }
   }
 
@@ -419,8 +416,8 @@ std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type
   return std::make_shared<DictionaryType>(index_type, dict_values);
 }
 
-std::shared_ptr<Field> field(
-    const std::string& name, const std::shared_ptr<DataType>& type, bool nullable,
+std::shared_ptr<Field> field(const std::string& name,
+    const std::shared_ptr<DataType>& type, bool nullable,
     const std::shared_ptr<const KeyValueMetadata>& metadata) {
   return std::make_shared<Field>(name, type, nullable, metadata);
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/02161456/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index bb25857..40615f7 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -205,7 +205,7 @@ class ARROW_EXPORT Field {
   Field(const std::string& name, const std::shared_ptr<DataType>& type,
       bool nullable = true,
       const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr)
-    : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}
+      : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {}
 
   std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }
 
@@ -753,8 +753,8 @@ std::shared_ptr<DataType> ARROW_EXPORT union_(
 std::shared_ptr<DataType> ARROW_EXPORT dictionary(
     const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values);
 
-std::shared_ptr<Field> ARROW_EXPORT field(
-    const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true,
+std::shared_ptr<Field> ARROW_EXPORT field(const std::string& name,
+    const std::shared_ptr<DataType>& type, bool nullable = true,
     const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr);
 
 // ----------------------------------------------------------------------