You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/04/05 13:34:56 UTC
[arrow] branch master updated: ARROW-2380: [Python] Streamline conversions
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9515fe9 ARROW-2380: [Python] Streamline conversions
9515fe9 is described below
commit 9515fe92d31c67590500b22abd4249ba5f6575bc
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Apr 5 15:34:46 2018 +0200
ARROW-2380: [Python] Streamline conversions
* Correctly check for overflow when constructing string / binary objects
* Accept bytearrays in the same places as bytes objects
* Other refactorings
Author: Antoine Pitrou <an...@python.org>
Closes #1835 from pitrou/ARROW-2380-py-streamline-conversions and squashes the following commits:
3ebb34f <Antoine Pitrou> Fix performance regression
b238d3e <Antoine Pitrou> Templatize integer conversions
1b269c9 <Antoine Pitrou> Make PyBytesView initializers non-static
08db003 <Antoine Pitrou> Create python/decimal.{h,cc}
b5bbb3e <Antoine Pitrou> ARROW-2380: Streamline conversions
---
cpp/src/arrow/python/CMakeLists.txt | 2 +
cpp/src/arrow/python/arrow_to_pandas.cc | 1 +
cpp/src/arrow/python/builtin_convert.cc | 254 ++--------------
cpp/src/arrow/python/builtin_convert.h | 3 -
cpp/src/arrow/python/common.cc | 16 +-
cpp/src/arrow/python/common.h | 96 ++++--
cpp/src/arrow/python/{helpers.cc => decimal.cc} | 118 +-------
cpp/src/arrow/python/{helpers.h => decimal.h} | 58 +---
cpp/src/arrow/python/helpers.cc | 371 +++++++++++++-----------
cpp/src/arrow/python/helpers.h | 110 +++----
cpp/src/arrow/python/numpy_to_arrow.cc | 100 ++-----
cpp/src/arrow/python/python-test.cc | 1 +
cpp/src/arrow/python/python_to_arrow.cc | 56 ++--
python/pyarrow/lib.pyx | 4 -
python/pyarrow/tests/test_convert_builtin.py | 38 ++-
python/pyarrow/tests/test_convert_pandas.py | 8 +-
python/pyarrow/tests/test_feather.py | 2 +-
17 files changed, 444 insertions(+), 794 deletions(-)
diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index b985df9..f6c92a7 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -54,6 +54,7 @@ set(ARROW_PYTHON_SRCS
builtin_convert.cc
common.cc
config.cc
+ decimal.cc
helpers.cc
init.cc
io.cc
@@ -104,6 +105,7 @@ install(FILES
builtin_convert.h
common.h
config.h
+ decimal.h
helpers.h
init.h
io.h
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 92461fc..41a07d0 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -47,6 +47,7 @@
#include "arrow/python/builtin_convert.h"
#include "arrow/python/common.h"
#include "arrow/python/config.h"
+#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/python/numpy-internal.h"
#include "arrow/python/numpy_convert.h"
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 5e99992..459e299 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -33,6 +33,7 @@
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
+#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/python/numpy_convert.h"
#include "arrow/python/util/datetime.h"
@@ -42,26 +43,9 @@ namespace py {
Status InvalidConversion(PyObject* obj, const std::string& expected_types,
std::ostream* out) {
- OwnedRef type(PyObject_Type(obj));
- RETURN_IF_PYERROR();
- DCHECK_NE(type.obj(), nullptr);
-
- OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__"));
- RETURN_IF_PYERROR();
- DCHECK_NE(type_name.obj(), nullptr);
-
- PyObjectStringify bytestring(type_name.obj());
- RETURN_IF_PYERROR();
-
- const char* bytes = bytestring.bytes;
- DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null";
-
- Py_ssize_t size = bytestring.size;
-
- std::string cpp_type_name(bytes, size);
-
- (*out) << "Got Python object of type " << cpp_type_name
+ (*out) << "Got Python object of type " << Py_TYPE(obj)->tp_name
<< " but can only handle these types: " << expected_types;
+ // XXX streamline this?
return Status::OK();
}
@@ -100,7 +84,7 @@ class ScalarVisitor {
++date_count_;
} else if (PyDateTime_CheckExact(obj)) {
++timestamp_count_;
- } else if (PyBytes_Check(obj)) {
+ } else if (internal::IsPyBinary(obj)) {
++binary_count_;
} else if (PyUnicode_Check(obj)) {
++unicode_count_;
@@ -445,122 +429,15 @@ class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter
}
};
-class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-
- if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int8_t>::max() ||
- val < std::numeric_limits<int8_t>::min())) {
- return Status::Invalid(
- "Cannot coerce values to array type that would "
- "lose data");
- }
- RETURN_IF_PYERROR();
- return typed_builder_->Append(static_cast<int8_t>(val));
- }
-};
-
-class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-
- if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int16_t>::max() ||
- val < std::numeric_limits<int16_t>::min())) {
- return Status::Invalid(
- "Cannot coerce values to array type that would "
- "lose data");
- }
- RETURN_IF_PYERROR();
- return typed_builder_->Append(static_cast<int16_t>(val));
- }
-};
-
-class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter> {
+template <typename IntType>
+class TypedIntConverter
+ : public TypedConverterVisitor<NumericBuilder<IntType>, TypedIntConverter<IntType>> {
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
- const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-
- if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int32_t>::max() ||
- val < std::numeric_limits<int32_t>::min())) {
- return Status::Invalid(
- "Cannot coerce values to array type that would "
- "lose data");
- }
- RETURN_IF_PYERROR();
- return typed_builder_->Append(static_cast<int32_t>(val));
- }
-};
-
-class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
- return typed_builder_->Append(val);
- }
-};
-
-class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- const auto val = static_cast<uint64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
-
- if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint8_t>::max())) {
- return Status::Invalid(
- "Cannot coerce values to array type that would "
- "lose data");
- }
- return typed_builder_->Append(static_cast<uint8_t>(val));
- }
-};
-
-class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- const auto val = static_cast<uint64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
-
- if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max())) {
- return Status::Invalid(
- "Cannot coerce values to array type that would "
- "lose data");
- }
- return typed_builder_->Append(static_cast<uint16_t>(val));
- }
-};
-
-class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- const auto val = static_cast<uint64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
-
- if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint32_t>::max())) {
- return Status::Invalid(
- "Cannot coerce values to array type that would "
- "lose data");
- }
- return typed_builder_->Append(static_cast<uint32_t>(val));
- }
-};
-
-class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Converter> {
- public:
- // Append a non-missing item
- Status AppendItem(PyObject* obj) {
- uint64_t val;
- RETURN_NOT_OK(internal::UInt64FromPythonInt(obj, &val));
- return typed_builder_->Append(val);
+ typename IntType::c_type value;
+ RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+ return this->typed_builder_->Append(value);
}
};
@@ -573,12 +450,7 @@ class Date32Converter : public TypedConverterVisitor<Date32Builder, Date32Conver
auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
t = static_cast<int32_t>(PyDate_to_s(pydate));
} else {
- const auto casted_val = static_cast<int64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
- if (casted_val > std::numeric_limits<int32_t>::max()) {
- return Status::Invalid("Integer as date32 larger than INT32_MAX");
- }
- t = static_cast<int32_t>(casted_val);
+ RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32"));
}
return typed_builder_->Append(t);
}
@@ -593,8 +465,7 @@ class Date64Converter : public TypedConverterVisitor<Date64Builder, Date64Conver
auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
t = PyDate_to_ms(pydate);
} else {
- t = static_cast<int64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
+ RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date64"));
}
return typed_builder_->Append(t);
}
@@ -645,8 +516,7 @@ class TimestampConverter
t = reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
} else {
- t = static_cast<int64_t>(PyLong_AsLongLong(obj));
- RETURN_IF_PYERROR();
+ RETURN_NOT_OK(internal::CIntFromPython(obj, &t));
}
return typed_builder_->Append(t);
}
@@ -690,27 +560,7 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverte
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
- PyObject* bytes_obj;
- const char* bytes;
- Py_ssize_t length;
- OwnedRef tmp;
-
- if (PyUnicode_Check(obj)) {
- tmp.reset(PyUnicode_AsUTF8String(obj));
- RETURN_IF_PYERROR();
- bytes_obj = tmp.obj();
- } else if (PyBytes_Check(obj)) {
- bytes_obj = obj;
- } else {
- std::stringstream ss;
- ss << "Error converting to Binary type: ";
- RETURN_NOT_OK(InvalidConversion(obj, "bytes", &ss));
- return Status::Invalid(ss.str());
- }
- // No error checking
- length = PyBytes_GET_SIZE(bytes_obj);
- bytes = PyBytes_AS_STRING(bytes_obj);
- return typed_builder_->Append(bytes, static_cast<int32_t>(length));
+ return internal::BuilderAppend(typed_builder_, obj);
}
};
@@ -719,27 +569,7 @@ class FixedWidthBytesConverter
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
- PyObject* bytes_obj;
- OwnedRef tmp;
- Py_ssize_t expected_length =
- std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
- ->byte_width();
- if (PyUnicode_Check(obj)) {
- tmp.reset(PyUnicode_AsUTF8String(obj));
- RETURN_IF_PYERROR();
- bytes_obj = tmp.obj();
- } else if (PyBytes_Check(obj)) {
- bytes_obj = obj;
- } else {
- std::stringstream ss;
- ss << "Error converting to FixedSizeBinary type: ";
- RETURN_NOT_OK(InvalidConversion(obj, "bytes", &ss));
- return Status::Invalid(ss.str());
- }
- // No error checking
- RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
- return typed_builder_->Append(
- reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj)));
+ return internal::BuilderAppend(typed_builder_, obj);
}
};
@@ -747,32 +577,7 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter>
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
- PyObject* bytes_obj;
- OwnedRef tmp;
- const char* bytes;
- Py_ssize_t length;
-
- if (PyBytes_Check(obj)) {
- tmp.reset(
- PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj)));
- RETURN_IF_PYERROR();
- bytes_obj = obj;
- } else if (!PyUnicode_Check(obj)) {
- OwnedRef repr(PyObject_Repr(obj));
- PyObjectStringify stringified(repr.obj());
- std::stringstream ss;
- ss << "Non bytes/unicode value encountered: " << stringified.bytes;
- return Status::Invalid(ss.str());
- } else {
- tmp.reset(PyUnicode_AsUTF8String(obj));
- RETURN_IF_PYERROR();
- bytes_obj = tmp.obj();
- }
-
- // No error checking
- length = PyBytes_GET_SIZE(bytes_obj);
- bytes = PyBytes_AS_STRING(bytes_obj);
- return typed_builder_->Append(bytes, static_cast<int32_t>(length));
+ return internal::BuilderAppend(typed_builder_, obj, true /* check_valid */);
}
};
@@ -876,21 +681,21 @@ std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
case Type::BOOL:
return std::unique_ptr<SeqConverter>(new BoolConverter);
case Type::INT8:
- return std::unique_ptr<SeqConverter>(new Int8Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int8Type>);
case Type::INT16:
- return std::unique_ptr<SeqConverter>(new Int16Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int16Type>);
case Type::INT32:
- return std::unique_ptr<SeqConverter>(new Int32Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int32Type>);
case Type::INT64:
- return std::unique_ptr<SeqConverter>(new Int64Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int64Type>);
case Type::UINT8:
- return std::unique_ptr<SeqConverter>(new UInt8Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt8Type>);
case Type::UINT16:
- return std::unique_ptr<SeqConverter>(new UInt16Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt16Type>);
case Type::UINT32:
- return std::unique_ptr<SeqConverter>(new UInt32Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt32Type>);
case Type::UINT64:
- return std::unique_ptr<SeqConverter>(new UInt64Converter);
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt64Type>);
case Type::DATE32:
return std::unique_ptr<SeqConverter>(new Date32Converter);
case Type::DATE64:
@@ -1032,16 +837,5 @@ Status ConvertPySequence(PyObject* obj, int64_t size,
return ConvertPySequenceReal(obj, size, &type, pool, out);
}
-Status CheckPythonBytesAreFixedLength(PyObject* obj, Py_ssize_t expected_length) {
- const Py_ssize_t length = PyBytes_GET_SIZE(obj);
- if (length != expected_length) {
- std::stringstream ss;
- ss << "Found byte string of length " << length << ", expected length is "
- << expected_length;
- return Status::Invalid(ss.str());
- }
- return Status::OK();
-}
-
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index 4bd3f08..7a32bec 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -73,9 +73,6 @@ ARROW_EXPORT
Status InvalidConversion(PyObject* obj, const std::string& expected_type_name,
std::ostream* out);
-ARROW_EXPORT Status CheckPythonBytesAreFixedLength(PyObject* obj,
- Py_ssize_t expected_length);
-
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/common.cc b/cpp/src/arrow/python/common.cc
index 1ded880..bd13f29 100644
--- a/cpp/src/arrow/python/common.cc
+++ b/cpp/src/arrow/python/common.cc
@@ -25,6 +25,8 @@
#include "arrow/status.h"
#include "arrow/util/logging.h"
+#include "arrow/python/helpers.h"
+
namespace arrow {
namespace py {
@@ -87,19 +89,15 @@ Status CheckPyError(StatusCode code) {
PyObject* exc_value = nullptr;
PyObject* traceback = nullptr;
- OwnedRef exc_type_ref(exc_type);
- OwnedRef exc_value_ref(exc_value);
- OwnedRef traceback_ref(traceback);
-
PyErr_Fetch(&exc_type, &exc_value, &traceback);
-
PyErr_NormalizeException(&exc_type, &exc_value, &traceback);
- OwnedRef exc_value_str(PyObject_Str(exc_value));
- PyObjectStringify stringified(exc_value_str.obj());
- std::string message(stringified.bytes);
+ OwnedRef exc_type_ref(exc_type);
+ OwnedRef exc_value_ref(exc_value);
+ OwnedRef traceback_ref(traceback);
- PyErr_Clear();
+ std::string message;
+ RETURN_NOT_OK(internal::PyObject_StdStringStr(exc_value, &message));
return Status(code, message);
}
return Status::OK();
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index b2844b1..76aee16 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -19,7 +19,9 @@
#define ARROW_PYTHON_COMMON_H
#include <memory>
+#include <sstream>
#include <string>
+#include <utility>
#include "arrow/python/config.h"
@@ -33,6 +35,15 @@ class MemoryPool;
namespace py {
+ARROW_EXPORT Status CheckPyError(StatusCode code = StatusCode::UnknownError);
+
+ARROW_EXPORT Status PassPyError();
+
+// TODO(wesm): We can just let errors pass through. To be explored later
+#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError());
+
+#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE));
+
class ARROW_EXPORT PyAcquireGIL {
public:
PyAcquireGIL() : acquired_gil_(false) { acquire(); }
@@ -70,6 +81,11 @@ class ARROW_EXPORT OwnedRef {
OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
explicit OwnedRef(PyObject* obj) : obj_(obj) {}
+ OwnedRef& operator=(OwnedRef&& other) {
+ obj_ = other.detach();
+ return *this;
+ }
+
~OwnedRef() { reset(); }
void reset(PyObject* obj) {
@@ -89,6 +105,8 @@ class ARROW_EXPORT OwnedRef {
PyObject** ref() { return &obj_; }
+ operator bool() const { return obj_ != NULLPTR; }
+
private:
ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef);
@@ -110,36 +128,72 @@ class ARROW_EXPORT OwnedRefNoGIL : public OwnedRef {
}
};
-struct ARROW_EXPORT PyObjectStringify {
- OwnedRef tmp_obj;
+// A temporary conversion of a Python object to a bytes area.
+struct ARROW_EXPORT PyBytesView {
const char* bytes;
Py_ssize_t size;
- explicit PyObjectStringify(PyObject* obj) {
- PyObject* bytes_obj;
+ PyBytesView() : bytes(nullptr), size(0), ref(nullptr) {}
+
+ // View the given Python object as binary-like, i.e. bytes
+ Status FromBinary(PyObject* obj) { return FromBinary(obj, "a bytes object"); }
+
+ // View the given Python object as string-like, i.e. str or (utf8) bytes
+ Status FromString(PyObject* obj, bool check_valid = false) {
if (PyUnicode_Check(obj)) {
- bytes_obj = PyUnicode_AsUTF8String(obj);
- tmp_obj.reset(bytes_obj);
- bytes = PyBytes_AsString(bytes_obj);
- size = PyBytes_GET_SIZE(bytes_obj);
- } else if (PyBytes_Check(obj)) {
- bytes = PyBytes_AsString(obj);
- size = PyBytes_GET_SIZE(obj);
+#if PY_MAJOR_VERSION >= 3
+ Py_ssize_t size;
+ // The utf-8 representation is cached on the unicode object
+ const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+ RETURN_IF_PYERROR();
+ this->bytes = data;
+ this->size = size;
+ this->ref.reset();
+ return Status::OK();
+#else
+ PyObject* converted = PyUnicode_AsUTF8String(obj);
+ RETURN_IF_PYERROR();
+ this->bytes = PyBytes_AS_STRING(converted);
+ this->size = PyBytes_GET_SIZE(converted);
+ this->ref.reset(converted);
+ return Status::OK();
+#endif
} else {
- bytes = NULLPTR;
- size = -1;
+ RETURN_NOT_OK(FromBinary(obj, "a string or bytes object"));
+ if (check_valid) {
+ // Check the bytes are valid utf-8
+ OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size));
+ RETURN_IF_PYERROR();
+ }
+ return Status::OK();
}
}
-};
-
-Status CheckPyError(StatusCode code = StatusCode::UnknownError);
-Status PassPyError();
-
-// TODO(wesm): We can just let errors pass through. To be explored later
-#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError());
+ protected:
+ PyBytesView(const char* b, Py_ssize_t s, PyObject* obj = nullptr)
+ : bytes(b), size(s), ref(obj) {}
+
+ Status FromBinary(PyObject* obj, const char* expected_msg) {
+ if (PyBytes_Check(obj)) {
+ this->bytes = PyBytes_AS_STRING(obj);
+ this->size = PyBytes_GET_SIZE(obj);
+ this->ref.reset();
+ return Status::OK();
+ } else if (PyByteArray_Check(obj)) {
+ this->bytes = PyByteArray_AS_STRING(obj);
+ this->size = PyByteArray_GET_SIZE(obj);
+ this->ref.reset();
+ return Status::OK();
+ } else {
+ std::stringstream ss;
+ ss << "Expected " << expected_msg << ", got a '" << Py_TYPE(obj)->tp_name
+ << "' object";
+ return Status::TypeError(ss.str());
+ }
+ }
-#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE));
+ OwnedRef ref;
+};
// Return the common PyArrow memory pool
ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/decimal.cc
similarity index 66%
copy from cpp/src/arrow/python/helpers.cc
copy to cpp/src/arrow/python/decimal.cc
index 4fd9ef2..10593c7 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/decimal.cc
@@ -17,9 +17,9 @@
#include <algorithm>
#include <limits>
-#include <sstream>
#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
@@ -28,75 +28,8 @@
namespace arrow {
namespace py {
-
-#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
- case Type::NAME: \
- return FACTORY()
-
-std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
- switch (type) {
- case Type::NA:
- return null();
- GET_PRIMITIVE_TYPE(UINT8, uint8);
- GET_PRIMITIVE_TYPE(INT8, int8);
- GET_PRIMITIVE_TYPE(UINT16, uint16);
- GET_PRIMITIVE_TYPE(INT16, int16);
- GET_PRIMITIVE_TYPE(UINT32, uint32);
- GET_PRIMITIVE_TYPE(INT32, int32);
- GET_PRIMITIVE_TYPE(UINT64, uint64);
- GET_PRIMITIVE_TYPE(INT64, int64);
- GET_PRIMITIVE_TYPE(DATE32, date32);
- GET_PRIMITIVE_TYPE(DATE64, date64);
- GET_PRIMITIVE_TYPE(BOOL, boolean);
- GET_PRIMITIVE_TYPE(HALF_FLOAT, float16);
- GET_PRIMITIVE_TYPE(FLOAT, float32);
- GET_PRIMITIVE_TYPE(DOUBLE, float64);
- GET_PRIMITIVE_TYPE(BINARY, binary);
- GET_PRIMITIVE_TYPE(STRING, utf8);
- default:
- return nullptr;
- }
-}
-
-PyObject* PyHalf_FromHalf(npy_half value) {
- PyObject* result = PyArrayScalar_New(Half);
- if (result != NULL) {
- PyArrayScalar_ASSIGN(result, Half, value);
- }
- return result;
-}
-
-Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
- if (PyArray_IsScalar(obj, Half)) {
- *out = PyArrayScalar_VAL(obj, Half);
- return Status::OK();
- } else {
- // XXX: cannot use npy_double_to_half() without linking with Numpy
- return Status::TypeError("Expected np.float16 instance");
- }
-}
-
namespace internal {
-Status ImportModule(const std::string& module_name, OwnedRef* ref) {
- PyObject* module = PyImport_ImportModule(module_name.c_str());
- RETURN_IF_PYERROR();
- DCHECK_NE(module, nullptr) << "unable to import the " << module_name << " module";
- ref->reset(module);
- return Status::OK();
-}
-
-Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref) {
- /// Assumes that ImportModule was called first
- DCHECK_NE(module.obj(), nullptr) << "Cannot import from nullptr Python module";
-
- PyObject* attr = PyObject_GetAttrString(module.obj(), name.c_str());
- RETURN_IF_PYERROR();
- DCHECK_NE(attr, nullptr) << "unable to import the " << name << " object";
- ref->reset(attr);
- return Status::OK();
-}
-
Status ImportDecimalType(OwnedRef* decimal_type) {
OwnedRef decimal_module;
RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
@@ -106,20 +39,7 @@ Status ImportDecimalType(OwnedRef* decimal_type) {
Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
// Call Python's str(decimal_object)
- OwnedRef str_obj(PyObject_Str(python_decimal));
- RETURN_IF_PYERROR();
-
- PyObjectStringify str(str_obj.obj());
- RETURN_IF_PYERROR();
-
- const char* bytes = str.bytes;
- DCHECK_NE(bytes, nullptr);
-
- Py_ssize_t size = str.size;
-
- std::string c_string(bytes, size);
- *out = c_string;
- return Status::OK();
+ return PyObject_StdStringStr(python_decimal, out);
}
// \brief Infer the precision and scale of a Python decimal.Decimal instance
@@ -219,31 +139,6 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arr
return Status::OK();
}
-bool IsPyInteger(PyObject* obj) {
-#if PYARROW_IS_PY2
- return PyLong_Check(obj) || PyInt_Check(obj);
-#else
- return PyLong_Check(obj);
-#endif
-}
-
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
- OwnedRef ref;
- // PyLong_AsUnsignedLongLong() doesn't handle conversion from non-ints
- // (e.g. np.uint64), so do it ourselves
- if (!PyLong_Check(obj)) {
- ref.reset(PyNumber_Long(obj));
- RETURN_IF_PYERROR();
- obj = ref.obj();
- }
- auto result = static_cast<uint64_t>(PyLong_AsUnsignedLongLong(obj));
- if (result == static_cast<uint64_t>(-1)) {
- RETURN_IF_PYERROR();
- }
- *out = static_cast<uint64_t>(result);
- return Status::OK();
-}
-
bool PyDecimal_Check(PyObject* obj) {
static OwnedRef decimal_type;
if (!decimal_type.obj()) {
@@ -301,15 +196,6 @@ Status DecimalMetadata::Update(PyObject* object) {
return Update(precision, scale);
}
-bool PyFloat_IsNaN(PyObject* obj) {
- return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
-}
-
-bool PandasObjectIsNull(PyObject* obj) {
- return obj == Py_None || obj == numpy_nan || PyFloat_IsNaN(obj) ||
- (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
-}
-
} // namespace internal
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/decimal.h
similarity index 63%
copy from cpp/src/arrow/python/helpers.h
copy to cpp/src/arrow/python/decimal.h
index e2f3b18..41d821f 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/decimal.h
@@ -15,20 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef PYARROW_HELPERS_H
-#define PYARROW_HELPERS_H
+#ifndef ARROW_PYTHON_DECIMAL_H
+#define ARROW_PYTHON_DECIMAL_H
-#include "arrow/python/platform.h"
-
-#include <memory>
#include <string>
-#include <utility>
-
-#include <numpy/halffloat.h>
#include "arrow/type.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
namespace arrow {
@@ -36,34 +28,13 @@ class Decimal128;
namespace py {
-class OwnedRef;
-
-// \brief Get an arrow DataType instance from Arrow's Type::type enum
-// \param[in] type One of the values of Arrow's Type::type enum
-// \return A shared pointer to DataType
-ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
-
-// \brief Construct a np.float16 object from a npy_half value.
-ARROW_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
-
-// \brief Convert a Python object to a npy_half value.
-ARROW_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
+//
+// Python Decimal support
+//
namespace internal {
-// \brief Import a Python module
-// \param[in] module_name The name of the module
-// \param[out] ref The OwnedRef containing the module PyObject*
-Status ImportModule(const std::string& module_name, OwnedRef* ref);
-
-// \brief Import an object from a Python module
-// \param[in] module A Python module
-// \param[in] name The name of the object to import
-// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
-// module
-Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
-
-// \brief Import
+// \brief Import the Python Decimal type
Status ImportDecimalType(OwnedRef* decimal_type);
// \brief Convert a Python Decimal object to a C++ string
@@ -87,15 +58,6 @@ PyObject* DecimalFromString(PyObject* decimal_constructor,
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
Decimal128* out);
-// \brief Check whether obj is an integer, independent of Python versions.
-bool IsPyInteger(PyObject* obj);
-
-// \brief Use pandas missing value semantics to check if a value is null
-bool PandasObjectIsNull(PyObject* obj);
-
-// \brief Check whether obj is nan
-bool PyFloat_IsNaN(PyObject* obj);
-
// \brief Check whether obj is an instance of Decimal
bool PyDecimal_Check(PyObject* obj);
@@ -103,12 +65,6 @@ bool PyDecimal_Check(PyObject* obj);
// is not a Decimal instance
bool PyDecimal_ISNAN(PyObject* obj);
-// \brief Convert a Python integer into an unsigned 64-bit integer
-// \param[in] obj A Python integer
-// \param[out] out A pointer to a C uint64_t to hold the result of the conversion
-// \return The status of the operation
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out);
-
// \brief Helper class to track and update the precision and scale of a decimal
class DecimalMetadata {
public:
@@ -137,4 +93,4 @@ class DecimalMetadata {
} // namespace py
} // namespace arrow
-#endif // PYARROW_HELPERS_H
+#endif // ARROW_PYTHON_DECIMAL_H
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 4fd9ef2..bb0837c 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -15,13 +15,14 @@
// specific language governing permissions and limitations
// under the License.
-#include <algorithm>
#include <limits>
#include <sstream>
+#include <type_traits>
+#include <typeinfo>
#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
-#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
#include <arrow/api.h>
@@ -78,6 +79,60 @@ Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
namespace internal {
+std::string PyBytes_AsStdString(PyObject* obj) {
+ DCHECK(PyBytes_Check(obj));
+ return std::string(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
+}
+
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
+ DCHECK(PyUnicode_Check(obj));
+#if PY_MAJOR_VERSION >= 3
+ Py_ssize_t size;
+ // The utf-8 representation is cached on the unicode object
+ const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+ RETURN_IF_PYERROR();
+ *out = std::string(data, size);
+ return Status::OK();
+#else
+ OwnedRef bytes_ref(PyUnicode_AsUTF8String(obj));
+ RETURN_IF_PYERROR();
+ *out = PyBytes_AsStdString(bytes_ref.obj());
+ return Status::OK();
+#endif
+}
+
+std::string PyObject_StdStringRepr(PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+ OwnedRef unicode_ref(PyObject_Repr(obj));
+ OwnedRef bytes_ref;
+
+ if (unicode_ref) {
+ bytes_ref.reset(
+ PyUnicode_AsEncodedString(unicode_ref.obj(), "utf8", "backslashreplace"));
+ }
+#else
+ OwnedRef bytes_ref(PyObject_Repr(obj));
+ if (!bytes_ref) {
+ PyErr_Clear();
+ std::stringstream ss;
+ ss << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
+ return ss.str();
+ }
+#endif
+ return PyBytes_AsStdString(bytes_ref.obj());
+}
+
+Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
+ OwnedRef string_ref(PyObject_Str(obj));
+ RETURN_IF_PYERROR();
+#if PY_MAJOR_VERSION >= 3
+ return PyUnicode_AsStdString(string_ref.obj(), out);
+#else
+ *out = PyBytes_AsStdString(string_ref.obj());
+ return Status::OK();
+#endif
+}
+
Status ImportModule(const std::string& module_name, OwnedRef* ref) {
PyObject* module = PyImport_ImportModule(module_name.c_str());
RETURN_IF_PYERROR();
@@ -97,209 +152,177 @@ Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRe
return Status::OK();
}
-Status ImportDecimalType(OwnedRef* decimal_type) {
- OwnedRef decimal_module;
- RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
- RETURN_NOT_OK(ImportFromModule(decimal_module, "Decimal", decimal_type));
+Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) {  // Append obj's bytes to builder; is_full is optional and reports whether the size limit was hit
+ PyBytesView view;
+ // XXX For some reason, we must accept unicode objects here
+ RETURN_NOT_OK(view.FromString(obj));  // NOTE(review): FromString is defined elsewhere; presumably it fills view.bytes / view.size
+ int32_t length;
+ RETURN_NOT_OK(CastSize(view.size, &length));  // fails if view.size is above the int32_t maximum
+ // Did we reach the builder size limit? (compared against kBinaryMemoryLimit)
+ if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+ if (is_full) {  // caller supplied a flag: report "full" without appending and without an error
+ *is_full = true;
+ return Status::OK();
+ } else {  // no flag supplied: hitting the limit is an error
+ return Status::Invalid("Maximum array size reached (2GB)");
+ }
+ }
+ RETURN_NOT_OK(builder->Append(view.bytes, length));
+ if (is_full) {
+ *is_full = false;  // the value was appended
+ }
return Status::OK();
}
-Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
- // Call Python's str(decimal_object)
- OwnedRef str_obj(PyObject_Str(python_decimal));
- RETURN_IF_PYERROR();
-
- PyObjectStringify str(str_obj.obj());
- RETURN_IF_PYERROR();
-
- const char* bytes = str.bytes;
- DCHECK_NE(bytes, nullptr);
-
- Py_ssize_t size = str.size;
-
- std::string c_string(bytes, size);
- *out = c_string;
+Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, bool* is_full) {  // Append obj's bytes to builder; is_full is optional and reports whether the size limit was hit
+ PyBytesView view;
+ // XXX For some reason, we must accept unicode objects here
+ RETURN_NOT_OK(view.FromString(obj));  // NOTE(review): FromString is defined elsewhere; presumably it fills view.bytes / view.size
+ const auto expected_length =
+ static_cast<const FixedSizeBinaryType&>(*builder->type()).byte_width();  // byte width taken from the builder's type
+ if (ARROW_PREDICT_FALSE(view.size != expected_length)) {  // the value must have exactly that width
+ std::stringstream ss;
+ ss << "Got bytestring of length " << view.size << " (expected " << expected_length
+ << ")";
+ return Status::Invalid(ss.str());
+ }
+ // Did we reach the builder size limit? (compared against kBinaryMemoryLimit)
+ if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size >
+ kBinaryMemoryLimit)) {
+ if (is_full) {  // caller supplied a flag: report "full" without appending and without an error
+ *is_full = true;
+ return Status::OK();
+ } else {  // no flag supplied: hitting the limit is an error
+ return Status::Invalid("Maximum array size reached (2GB)");
+ }
+ }
+ RETURN_NOT_OK(builder->Append(view.bytes));  // no length argument here; the length was checked against the type's byte width above
+ if (is_full) {
+ *is_full = false;  // the value was appended
+ }
return Status::OK();
}
-// \brief Infer the precision and scale of a Python decimal.Decimal instance
-// \param python_decimal[in] An instance of decimal.Decimal
-// \param precision[out] The value of the inferred precision
-// \param scale[out] The value of the inferred scale
-// \return The status of the operation
-static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
- int32_t* scale) {
- DCHECK_NE(python_decimal, NULLPTR);
- DCHECK_NE(precision, NULLPTR);
- DCHECK_NE(scale, NULLPTR);
-
- // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a DCHECK
- OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast<char*>("as_tuple"),
- const_cast<char*>("")));
- RETURN_IF_PYERROR();
- DCHECK(PyTuple_Check(as_tuple.obj()));
-
- OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits"));
- RETURN_IF_PYERROR();
- DCHECK(PyTuple_Check(digits.obj()));
-
- const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj()));
- RETURN_IF_PYERROR();
-
- OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent"));
- RETURN_IF_PYERROR();
- DCHECK(IsPyInteger(py_exponent.obj()));
-
- const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
- RETURN_IF_PYERROR();
-
- const int32_t abs_exponent = std::abs(exponent);
-
- int32_t num_additional_zeros;
-
- if (num_digits <= abs_exponent) {
- DCHECK_NE(exponent, 0) << "exponent should never be zero here";
-
- // we have leading/trailing zeros, leading if exponent is negative
- num_additional_zeros = exponent < 0 ? abs_exponent - num_digits : exponent;
- *scale = static_cast<int32_t>(exponent < 0) * -exponent;
- } else {
- // we can use the number of digits as the precision
- num_additional_zeros = 0;
- *scale = -exponent;
+Status BuilderAppend(StringBuilder* builder, PyObject* obj, bool check_valid,  // Append obj's bytes to builder; check_valid is forwarded to PyBytesView::FromString
+ bool* is_full) {  // is_full is optional and reports whether the size limit was hit
+ PyBytesView view;
+ RETURN_NOT_OK(view.FromString(obj, check_valid));  // NOTE(review): FromString is defined elsewhere; presumably it fills view.bytes / view.size
+ int32_t length;
+ RETURN_NOT_OK(CastSize(view.size, &length));  // fails if view.size is above the int32_t maximum
+ // Did we reach the builder size limit? (compared against kBinaryMemoryLimit)
+ if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+ if (is_full) {  // caller supplied a flag: report "full" without appending and without an error
+ *is_full = true;
+ return Status::OK();
+ } else {  // no flag supplied: hitting the limit is an error
+ return Status::Invalid("Maximum array size reached (2GB)");
+ }
+ }
+ RETURN_NOT_OK(builder->Append(view.bytes, length));
+ if (is_full) {
+ *is_full = false;  // the value was appended
}
-
- *precision = num_digits + num_additional_zeros;
return Status::OK();
}
-PyObject* DecimalFromString(PyObject* decimal_constructor,
- const std::string& decimal_string) {
- DCHECK_NE(decimal_constructor, nullptr);
-
- auto string_size = decimal_string.size();
- DCHECK_GT(string_size, 0);
-
- auto string_bytes = decimal_string.c_str();
- DCHECK_NE(string_bytes, nullptr);
-
- return PyObject_CallFunction(decimal_constructor, const_cast<char*>("s#"), string_bytes,
- string_size);
-}
-
-Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
- Decimal128* out) {
- DCHECK_NE(python_decimal, NULLPTR);
- DCHECK_NE(out, NULLPTR);
-
- std::string string;
- RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string));
+namespace {
- int32_t inferred_precision;
- int32_t inferred_scale;
-
- RETURN_NOT_OK(
- Decimal128::FromString(string, out, &inferred_precision, &inferred_scale));
-
- const int32_t precision = arrow_type.precision();
- const int32_t scale = arrow_type.scale();
-
- if (ARROW_PREDICT_FALSE(inferred_precision > precision)) {
- std::stringstream buf;
- buf << "Decimal type with precision " << inferred_precision
- << " does not fit into precision inferred from first array element: "
- << precision;
- return Status::Invalid(buf.str());
+Status IntegerOverflowStatus(const std::string& overflow_message) {  // Build the Invalid status returned when a Python integer does not fit the target C type
+ if (overflow_message.empty()) {  // no caller-supplied text: use the generic message
+ return Status::Invalid("Value too large to fit in C integer type");
+ } else {
+ return Status::Invalid(overflow_message);
}
+}
- if (scale != inferred_scale) {
- DCHECK_NE(out, NULLPTR);
- RETURN_NOT_OK(out->Rescale(inferred_scale, scale, out));
+// Extract C signed int from Python object; a value outside the range of Int yields IntegerOverflowStatus(overflow_message)
+template <typename Int,
+ typename std::enable_if<std::is_signed<Int>::value, Int>::type = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+ static_assert(sizeof(Int) <= sizeof(long long), // NOLINT
+ "integer type larger than long long");
+
+ if (sizeof(Int) > sizeof(long)) { // NOLINT
+ const auto value = PyLong_AsLongLong(obj);  // Int is wider than long, so convert through long long
+ if (ARROW_PREDICT_FALSE(value == -1)) {  // -1 may be a real value or an error marker, so check for a pending Python error
+ RETURN_IF_PYERROR();
+ }
+ if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+ value > std::numeric_limits<Int>::max())) {
+ return IntegerOverflowStatus(overflow_message);
+ }
+ *out = static_cast<Int>(value);
+ } else {
+ const auto value = PyLong_AsLong(obj);  // Int fits in long
+ if (ARROW_PREDICT_FALSE(value == -1)) {  // -1 may be a real value or an error marker, so check for a pending Python error
+ RETURN_IF_PYERROR();
+ }
+ if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+ value > std::numeric_limits<Int>::max())) {  // narrower targets (e.g. int8_t) need an explicit range check
+ return IntegerOverflowStatus(overflow_message);
+ }
+ *out = static_cast<Int>(value);
}
return Status::OK();
}
-bool IsPyInteger(PyObject* obj) {
-#if PYARROW_IS_PY2
- return PyLong_Check(obj) || PyInt_Check(obj);
-#else
- return PyLong_Check(obj);
-#endif
-}
+// Extract C unsigned int from Python object; a value above the maximum of Int yields IntegerOverflowStatus(overflow_message)
+template <typename Int,
+ typename std::enable_if<std::is_unsigned<Int>::value, Int>::type = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+ static_assert(sizeof(Int) <= sizeof(unsigned long long), // NOLINT
+ "integer type larger than unsigned long long");
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
OwnedRef ref;
- // PyLong_AsUnsignedLongLong() doesn't handle conversion from non-ints
- // (e.g. np.uint64), so do it ourselves
+ // PyLong_AsUnsignedLong() and PyLong_AsUnsignedLongLong() don't handle
+ // conversion from non-ints (e.g. np.uint64), so do it ourselves
if (!PyLong_Check(obj)) {
ref.reset(PyNumber_Long(obj));
- RETURN_IF_PYERROR();
+ if (!ref) {  // only look for a pending Python error when the conversion returned null
+ RETURN_IF_PYERROR();
+ }
obj = ref.obj();
}
- auto result = static_cast<uint64_t>(PyLong_AsUnsignedLongLong(obj));
- if (result == static_cast<uint64_t>(-1)) {
- RETURN_IF_PYERROR();
+ if (sizeof(Int) > sizeof(unsigned long)) { // NOLINT
+ const auto value = PyLong_AsUnsignedLongLong(obj);  // Int is wider than unsigned long
+ if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {  // all-ones may be a real value or an error marker, so check for a pending Python error
+ RETURN_IF_PYERROR();
+ }
+ if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+ return IntegerOverflowStatus(overflow_message);
+ }
+ *out = static_cast<Int>(value);
+ } else {
+ const auto value = PyLong_AsUnsignedLong(obj);  // Int fits in unsigned long
+ if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {  // all-ones may be a real value or an error marker, so check for a pending Python error
+ RETURN_IF_PYERROR();
+ }
+ if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {  // narrower targets (e.g. uint8_t) need an explicit range check
+ return IntegerOverflowStatus(overflow_message);
+ }
+ *out = static_cast<Int>(value);
}
- *out = static_cast<uint64_t>(result);
return Status::OK();
}
-bool PyDecimal_Check(PyObject* obj) {
- static OwnedRef decimal_type;
- if (!decimal_type.obj()) {
- Status status = ImportDecimalType(&decimal_type);
- DCHECK_OK(status);
- DCHECK(PyType_Check(decimal_type.obj()));
- }
- // PyObject_IsInstance() is slower as it has to check for virtual subclasses
- const int result =
- PyType_IsSubtype(Py_TYPE(obj), reinterpret_cast<PyTypeObject*>(decimal_type.obj()));
- DCHECK_NE(result, -1) << " error during PyType_IsSubtype check";
- return result == 1;
-}
-
-bool PyDecimal_ISNAN(PyObject* obj) {
- DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
- OwnedRef is_nan(
- PyObject_CallMethod(obj, const_cast<char*>("is_nan"), const_cast<char*>("")));
- return PyObject_IsTrue(is_nan.obj()) == 1;
-}
-
-DecimalMetadata::DecimalMetadata()
- : DecimalMetadata(std::numeric_limits<int32_t>::min(),
- std::numeric_limits<int32_t>::min()) {}
+} // namespace
-DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale)
- : precision_(precision), scale_(scale) {}
-
-Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) {
- const int32_t current_precision = precision_;
- precision_ = std::max(current_precision, suggested_precision);
-
- const int32_t current_scale = scale_;
- scale_ = std::max(current_scale, suggested_scale);
-
- // if our suggested scale is zero and we don't yet have enough precision then we need to
- // add whatever the current scale is to the precision
- if (suggested_scale == 0 && suggested_precision > current_precision) {
- precision_ += scale_;
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {  // Convert a Python integer to the C integer type Int, dispatching on signedness
+ if (PyBool_Check(obj)) {  // bools are rejected explicitly with a TypeError status
+ return Status::TypeError("Expected integer, got bool");
}
-
- return Status::OK();
+ return CIntFromPythonImpl(obj, out, overflow_message);  // overload chosen by enable_if on is_signed / is_unsigned
}
-Status DecimalMetadata::Update(PyObject* object) {
- DCHECK(PyDecimal_Check(object)) << "Object is not a Python Decimal";
-
- if (ARROW_PREDICT_FALSE(PyDecimal_ISNAN(object))) {
- return Status::OK();
- }
-
- int32_t precision;
- int32_t scale;
- RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
- return Update(precision, scale);
-}
+template Status CIntFromPython(PyObject*, int8_t*, const std::string&);  // explicit instantiations for the fixed-width signed types
+template Status CIntFromPython(PyObject*, int16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int64_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint8_t*, const std::string&);  // explicit instantiations for the fixed-width unsigned types
+template Status CIntFromPython(PyObject*, uint16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint64_t*, const std::string&);
bool PyFloat_IsNaN(PyObject* obj) {
return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index e2f3b18..195d5fb 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -15,11 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef PYARROW_HELPERS_H
-#define PYARROW_HELPERS_H
+#ifndef ARROW_PYTHON_HELPERS_H
+#define ARROW_PYTHON_HELPERS_H
#include "arrow/python/platform.h"
+#include <limits>
#include <memory>
#include <string>
#include <utility>
@@ -32,8 +33,6 @@
namespace arrow {
-class Decimal128;
-
namespace py {
class OwnedRef;
@@ -63,32 +62,14 @@ Status ImportModule(const std::string& module_name, OwnedRef* ref);
// module
Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
-// \brief Import
-Status ImportDecimalType(OwnedRef* decimal_type);
-
-// \brief Convert a Python Decimal object to a C++ string
-// \param[in] python_decimal A Python decimal.Decimal instance
-// \param[out] The string representation of the Python Decimal instance
-// \return The status of the operation
-Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
-
-// \brief Convert a C++ std::string to a Python Decimal instance
-// \param[in] decimal_constructor The decimal type object
-// \param[in] decimal_string A decimal string
-// \return An instance of decimal.Decimal
-PyObject* DecimalFromString(PyObject* decimal_constructor,
- const std::string& decimal_string);
-
-// \brief Convert a Python decimal to an Arrow Decimal128 object
-// \param[in] python_decimal A Python decimal.Decimal instance
-// \param[in] arrow_type An instance of arrow::DecimalType
-// \param[out] out A pointer to a Decimal128
-// \return The status of the operation
-Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
- Decimal128* out);
-
// \brief Check whether obj is an integer, independent of Python versions.
-bool IsPyInteger(PyObject* obj);
+inline bool IsPyInteger(PyObject* obj) {
+  // A long always counts as an integer; on Python 2 builds the separate
+  // int type is accepted as well.
+  if (PyLong_Check(obj)) {
+    return true;
+  }
+#if PYARROW_IS_PY2
+  return PyInt_Check(obj);
+#else
+  return false;
+#endif
+}
// \brief Use pandas missing value semantics to check if a value is null
bool PandasObjectIsNull(PyObject* obj);
@@ -96,45 +77,48 @@ bool PandasObjectIsNull(PyObject* obj);
// \brief Check whether obj is nan
bool PyFloat_IsNaN(PyObject* obj);
-// \brief Check whether obj is an instance of Decimal
-bool PyDecimal_Check(PyObject* obj);
-
-// \brief Check whether obj is nan. This function will abort the program if the argument
-// is not a Decimal instance
-bool PyDecimal_ISNAN(PyObject* obj);
+// \brief Check whether obj is a bytes or bytearray object
+inline bool IsPyBinary(PyObject* obj) {
+  if (PyBytes_Check(obj)) {
+    return true;
+  }
+  return PyByteArray_Check(obj) != 0;
+}
-// \brief Convert a Python integer into an unsigned 64-bit integer
+// \brief Convert a Python integer into a C integer
// \param[in] obj A Python integer
-// \param[out] out A pointer to a C uint64_t to hold the result of the conversion
+// \param[out] out A pointer to a C integer to hold the result of the conversion
// \return The status of the operation
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out);
-
-// \brief Helper class to track and update the precision and scale of a decimal
-class DecimalMetadata {
- public:
- DecimalMetadata();
- DecimalMetadata(int32_t precision, int32_t scale);
-
- // \brief Adjust the precision and scale of a decimal type given a new precision and a
- // new scale \param[in] suggested_precision A candidate precision \param[in]
- // suggested_scale A candidate scale \return The status of the operation
- Status Update(int32_t suggested_precision, int32_t suggested_scale);
-
- // \brief A convenient interface for updating the precision and scale based on a Python
- // Decimal object \param object A Python Decimal object \return The status of the
- // operation
- Status Update(PyObject* object);
-
- int32_t precision() const { return precision_; }
- int32_t scale() const { return scale_; }
-
- private:
- int32_t precision_;
- int32_t scale_;
-};
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
+
+// \brief Convert a Python unicode string to a std::string
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
+
+// \brief Convert a Python bytes object to a std::string
+std::string PyBytes_AsStdString(PyObject* obj);
+
+// \brief Call str() on the given object and return the result as a std::string
+Status PyObject_StdStringStr(PyObject* obj, std::string* out);
+
+// \brief Return the repr() of the given object (always succeeds)
+std::string PyObject_StdStringRepr(PyObject* obj);
+
+// \brief Cast the given size to int32_t, with error checking
+inline Status CastSize(Py_ssize_t size, int32_t* out,
+ const char* error_msg = "Maximum size exceeded (2GB)") {
+ // size is assumed to be positive
+ if (size > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid(error_msg);
+ }
+ *out = static_cast<int32_t>(size);
+ return Status::OK();
+}
+
+Status BuilderAppend(StringBuilder* builder, PyObject* obj, bool check_valid = false,
+ bool* is_full = nullptr);
+Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full = nullptr);
+Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
+ bool* is_full = nullptr);
} // namespace internal
} // namespace py
} // namespace arrow
-#endif // PYARROW_HELPERS_H
+#endif // ARROW_PYTHON_HELPERS_H
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index eb0af8b..e37013c 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -49,6 +49,7 @@
#include "arrow/python/builtin_convert.h"
#include "arrow/python/common.h"
#include "arrow/python/config.h"
+#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
#include "arrow/python/numpy-internal.h"
#include "arrow/python/numpy_convert.h"
@@ -65,14 +66,6 @@ using internal::NumPyTypeSize;
namespace {
-inline bool PyObject_is_string(PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
- return PyUnicode_Check(obj) || PyBytes_Check(obj);
-#else
- return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
inline bool PyObject_is_integer(PyObject* obj) {
return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj);
}
@@ -204,8 +197,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask,
int64_t offset, BinaryBuilder* builder,
int64_t* end_offset) {
- PyObject* obj;
-
Ndarray1DIndexer<PyObject*> objects(arr);
Ndarray1DIndexer<uint8_t> mask_values;
@@ -216,30 +207,15 @@ static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask,
}
for (; offset < objects.size(); ++offset) {
- OwnedRef tmp_obj;
- obj = objects[offset];
+ PyObject* obj = objects[offset];
if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
- } else if (PyBytes_Check(obj)) {
- const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
- if (ARROW_PREDICT_FALSE(builder->value_data_length() + length >
- kBinaryMemoryLimit)) {
- break;
- }
- RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
- } else if (PyByteArray_Check(obj)) {
- const int32_t length = static_cast<int32_t>(PyByteArray_GET_SIZE(obj));
- if (ARROW_PREDICT_FALSE(builder->value_data_length() + length >
- kBinaryMemoryLimit)) {
- break;
- }
- RETURN_NOT_OK(builder->Append(PyByteArray_AS_STRING(obj), length));
- } else {
- std::stringstream ss;
- ss << "Error converting from Python objects to bytes: ";
- RETURN_NOT_OK(InvalidConversion(obj, "str, bytes, bytearray", &ss));
- return Status::Invalid(ss.str());
+ }
+ bool is_full;
+ RETURN_NOT_OK(internal::BuilderAppend(builder, obj, &is_full));
+ if (is_full) {
+ break;
}
}
@@ -275,27 +251,16 @@ static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64
if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
- } else if (PyUnicode_Check(obj)) {
- obj = PyUnicode_AsUTF8String(obj);
- if (obj == NULL) {
- PyErr_Clear();
- return Status::Invalid("failed converting unicode to UTF8");
- }
- tmp_obj.reset(obj);
- } else if (PyBytes_Check(obj)) {
+ }
+ if (internal::IsPyBinary(obj)) {
*have_bytes = true;
- } else {
- std::stringstream ss;
- ss << "Error converting from Python objects to String/UTF8: ";
- RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
- return Status::Invalid(ss.str());
}
-
- const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
- if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+ bool is_full;
+ RETURN_NOT_OK(
+ internal::BuilderAppend(builder, obj, false /* check_valid */, &is_full));
+ if (is_full) {
break;
}
- RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
}
// If we consumed the whole array, this will be the length of arr
@@ -324,28 +289,12 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas
if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
continue;
- } else if (PyUnicode_Check(obj)) {
- obj = PyUnicode_AsUTF8String(obj);
- if (obj == NULL) {
- PyErr_Clear();
- return Status::Invalid("failed converting unicode to UTF8");
- }
-
- tmp_obj.reset(obj);
- } else if (!PyBytes_Check(obj)) {
- std::stringstream ss;
- ss << "Error converting from Python objects to FixedSizeBinary: ";
- RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
- return Status::Invalid(ss.str());
}
-
- RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
- if (ARROW_PREDICT_FALSE(builder->value_data_length() + byte_width >
- kBinaryMemoryLimit)) {
+ bool is_full;
+ RETURN_NOT_OK(internal::BuilderAppend(builder, obj, &is_full));
+ if (is_full) {
break;
}
- RETURN_NOT_OK(
- builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
}
// If we consumed the whole array, this will be the length of arr
@@ -981,15 +930,10 @@ Status NumPyConverter::ConvertObjectIntegers() {
obj = objects[i];
if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder.AppendNull());
- } else if (PyObject_is_integer(obj)) {
- const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj));
- RETURN_IF_PYERROR();
- RETURN_NOT_OK(builder.Append(val));
} else {
- std::stringstream ss;
- ss << "Error converting from Python objects to Int64: ";
- RETURN_NOT_OK(InvalidConversion(obj, "integer", &ss));
- return Status::Invalid(ss.str());
+ int64_t val;
+ RETURN_NOT_OK(internal::CIntFromPython(obj, &val));
+ RETURN_NOT_OK(builder.Append(val));
}
}
@@ -1102,7 +1046,9 @@ Status NumPyConverter::ConvertObjectsInfer() {
PyObject* obj = objects[i];
if (internal::PandasObjectIsNull(obj)) {
continue;
- } else if (PyObject_is_string(obj)) {
+ } else if (PyUnicode_Check(obj) || internal::IsPyBinary(obj)) {
+ // The exact Arrow type (Binary or String) will be decided based on
+ // Python object types
return ConvertObjectStrings();
} else if (PyFloat_Check(obj)) {
return ConvertObjectFloats();
@@ -1119,8 +1065,6 @@ Status NumPyConverter::ConvertObjectsInfer() {
return ConvertTimes();
} else if (PyObject_IsInstance(obj, decimal_type_.obj()) == 1) {
return ConvertDecimals();
- } else if (PyByteArray_Check(obj)) {
- return ConvertObjectBytes();
} else if (PyList_Check(obj)) {
std::shared_ptr<DataType> inferred_type;
RETURN_NOT_OK(InferArrowType(obj, &inferred_type));
diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc
index c18b159..293255b 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -28,6 +28,7 @@
#include "arrow/python/arrow_to_pandas.h"
#include "arrow/python/builtin_convert.h"
+#include "arrow/python/decimal.h"
#include "arrow/python/helpers.h"
namespace arrow {
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index d781d9f..998fa8a 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -84,7 +84,9 @@ class SequenceBuilder {
if (*tag == -1) {
*tag = num_tags_++;
}
- RETURN_NOT_OK(offsets_.Append(static_cast<int32_t>(offset)));
+ int32_t offset32;
+ RETURN_NOT_OK(internal::CastSize(offset, &offset32));
+ RETURN_NOT_OK(offsets_.Append(offset32));
RETURN_NOT_OK(types_.Append(*tag));
return nones_.AppendToBitmap(true);
}
@@ -173,26 +175,34 @@ class SequenceBuilder {
/// \param size
/// The size of the sublist
Status AppendList(Py_ssize_t size) {  // record a sublist of `size` elements by pushing its end offset onto list_offsets_
+ int32_t offset;
+ RETURN_NOT_OK(internal::CastSize(list_offsets_.back() + size, &offset));  // new end offset; rejected before Update() runs if it is above the int32_t maximum
RETURN_NOT_OK(Update(list_offsets_.size() - 1, &list_tag_));
- list_offsets_.push_back(list_offsets_.back() + static_cast<int32_t>(size));
+ list_offsets_.push_back(offset);
return Status::OK();
}
Status AppendTuple(Py_ssize_t size) {  // record a tuple of `size` elements by pushing its end offset onto tuple_offsets_
+ int32_t offset;
+ RETURN_NOT_OK(internal::CastSize(tuple_offsets_.back() + size, &offset));  // new end offset; rejected before Update() runs if it is above the int32_t maximum
RETURN_NOT_OK(Update(tuple_offsets_.size() - 1, &tuple_tag_));
- tuple_offsets_.push_back(tuple_offsets_.back() + static_cast<int32_t>(size));
+ tuple_offsets_.push_back(offset);
return Status::OK();
}
Status AppendDict(Py_ssize_t size) {  // record a dict of `size` entries by pushing its end offset onto dict_offsets_
+ int32_t offset;
+ RETURN_NOT_OK(internal::CastSize(dict_offsets_.back() + size, &offset));  // new end offset; rejected before Update() runs if it is above the int32_t maximum
RETURN_NOT_OK(Update(dict_offsets_.size() - 1, &dict_tag_));
- dict_offsets_.push_back(dict_offsets_.back() + static_cast<int32_t>(size));
+ dict_offsets_.push_back(offset);
return Status::OK();
}
Status AppendSet(Py_ssize_t size) {  // record a set of `size` elements by pushing its end offset onto set_offsets_
+ int32_t offset;
+ RETURN_NOT_OK(internal::CastSize(set_offsets_.back() + size, &offset));  // new end offset; rejected before Update() runs if it is above the int32_t maximum
RETURN_NOT_OK(Update(set_offsets_.size() - 1, &set_tag_));
- set_offsets_.push_back(set_offsets_.back() + static_cast<int32_t>(size));
+ set_offsets_.push_back(offset);
return Status::OK();
}
@@ -365,17 +375,8 @@ Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* el
*result = NULL;
if (context == Py_None) {
std::stringstream ss;
- OwnedRef repr(PyObject_Repr(elem));
- RETURN_IF_PYERROR();
-#if PY_MAJOR_VERSION >= 3
- OwnedRef ascii(PyUnicode_AsASCIIString(repr.obj()));
- RETURN_IF_PYERROR();
- ss << "error while calling callback on " << PyBytes_AsString(ascii.obj())
+ ss << "error while calling callback on " << internal::PyObject_StdStringRepr(elem)
<< ": handler not registered";
-#else
- ss << "error while calling callback on " << PyString_AsString(repr.obj())
- << ": handler not registered";
-#endif
return Status::SerializationError(ss.str());
} else {
*result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL);
@@ -483,24 +484,15 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
#endif
} else if (PyBytes_Check(elem)) {
auto data = reinterpret_cast<uint8_t*>(PyBytes_AS_STRING(elem));
- const int64_t size = static_cast<int64_t>(PyBytes_GET_SIZE(elem));
- if (size > std::numeric_limits<int32_t>::max()) {
- return Status::Invalid("Cannot writes bytes over 2GB");
- }
- RETURN_NOT_OK(builder->AppendBytes(data, static_cast<int32_t>(size)));
+ int32_t size;
+ RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size));
+ RETURN_NOT_OK(builder->AppendBytes(data, size));
} else if (PyUnicode_Check(elem)) {
- Py_ssize_t size;
-#if PY_MAJOR_VERSION >= 3
- char* data = PyUnicode_AsUTF8AndSize(elem, &size);
-#else
- OwnedRef str(PyUnicode_AsUTF8String(elem));
- char* data = PyString_AS_STRING(str.obj());
- size = PyString_GET_SIZE(str.obj());
-#endif
- if (size > std::numeric_limits<int32_t>::max()) {
- return Status::Invalid("Cannot writes bytes over 2GB");
- }
- RETURN_NOT_OK(builder->AppendString(data, static_cast<int32_t>(size)));
+ PyBytesView view;
+ RETURN_NOT_OK(view.FromString(elem));
+ int32_t size;
+ RETURN_NOT_OK(internal::CastSize(view.size, &size));
+ RETURN_NOT_OK(builder->AppendString(view.bytes, size));
} else if (PyList_CheckExact(elem)) {
RETURN_NOT_OK(builder->AppendList(PyList_Size(elem)));
sublists->push_back(elem);
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 672be08..8929ea0 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -33,10 +33,6 @@ from pyarrow.includes.common cimport PyObject_to_object
cimport pyarrow.includes.libarrow as libarrow
cimport cpython as cp
-cdef _pandas():
- import pandas as pd
- return pd
-
arrow_init_numpy()
set_numpy_nan(np.nan)
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 988d512..1c5dd71 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -215,6 +215,26 @@ def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected
+@pytest.mark.parametrize("bits", [8, 16, 32, 64])
+def test_signed_integer_overflow(bits):
+ ty = getattr(pa, "int%d" % bits)()
+ # XXX ideally would raise OverflowError
+ with pytest.raises((ValueError, pa.ArrowException)):
+ pa.array([2 ** (bits - 1)], ty)
+ with pytest.raises((ValueError, pa.ArrowException)):
+ pa.array([-2 ** (bits - 1) - 1], ty)
+
+
+@pytest.mark.parametrize("bits", [8, 16, 32, 64])
+def test_unsigned_integer_overflow(bits):
+ ty = getattr(pa, "uint%d" % bits)()
+ # XXX ideally would raise OverflowError
+ with pytest.raises((ValueError, pa.ArrowException)):
+ pa.array([2 ** bits], ty)
+ with pytest.raises((ValueError, pa.ArrowException)):
+ pa.array([-1], ty)
+
+
def test_garbage_collection():
import gc
@@ -260,12 +280,14 @@ def test_sequence_bytes():
u1 = b'ma\xc3\xb1ana'
data = [b'foo',
u1.decode('utf-8'), # unicode gets encoded,
+ bytearray(b'bar'),
None]
- arr = pa.array(data)
- assert len(arr) == 3
- assert arr.null_count == 1
- assert arr.type == pa.binary()
- assert arr.to_pylist() == [b'foo', u1, None]
+ for ty in [None, pa.binary()]:
+ arr = pa.array(data, type=ty)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.binary()
+ assert arr.to_pylist() == [b'foo', u1, b'bar', None]
def test_sequence_utf8_to_unicode():
@@ -281,12 +303,12 @@ def test_sequence_utf8_to_unicode():
def test_sequence_fixed_size_bytes():
- data = [b'foof', None, b'barb', b'2346']
+ data = [b'foof', None, bytearray(b'barb'), b'2346']
arr = pa.array(data, type=pa.binary(4))
assert len(arr) == 4
assert arr.null_count == 1
assert arr.type == pa.binary(4)
- assert arr.to_pylist() == data
+ assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
def test_fixed_size_bytes_does_not_accept_varying_lengths():
@@ -473,7 +495,7 @@ def test_sequence_mixed_types_with_specified_type_fails():
data = ['-10', '-5', {'a': 1}, '0', '5', '10']
type = pa.string()
- with pytest.raises(pa.ArrowInvalid):
+ with pytest.raises(TypeError):
pa.array(data, type=type)
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 04b1fa4..c6e2b75 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -30,7 +30,7 @@ import numpy.testing as npt
import pandas as pd
import pandas.util.testing as tm
-from pyarrow.compat import u, PY2
+from pyarrow.compat import PY2
import pyarrow as pa
import pyarrow.types as patypes
@@ -1065,13 +1065,13 @@ class TestConvertStringLikeTypes(object):
_check_pandas_roundtrip(df, expected_schema=schema)
def test_bytes_to_binary(self):
- values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
+ values = [u'qux', b'foo', None, bytearray(b'barz'), 'qux', np.nan]
df = pd.DataFrame({'strings': values})
table = pa.Table.from_pandas(df)
assert table[0].type == pa.binary()
- values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
+ values2 = [b'qux', b'foo', None, b'barz', b'qux', np.nan]
expected = pd.DataFrame({'strings': values2})
_check_pandas_roundtrip(df, expected)
@@ -1093,7 +1093,7 @@ class TestConvertStringLikeTypes(object):
assert table[0].data.num_chunks == 2
def test_fixed_size_bytes(self):
- values = [b'foo', None, b'bar', None, None, b'hey']
+ values = [b'foo', None, bytearray(b'bar'), None, None, b'hey']
df = pd.DataFrame({'strings': values})
schema = pa.schema([pa.field('strings', pa.binary(3))])
table = pa.Table.from_pandas(df, schema=schema)
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index b0764fd..a14673f 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -465,7 +465,7 @@ class TestFeatherReader(unittest.TestCase):
# non-strings
df = pd.DataFrame({'a': ['a', 1, 2.0]})
- self._assert_error_on_write(df, ValueError)
+ self._assert_error_on_write(df, TypeError)
@pytest.mark.slow
def test_large_dataframe(self):
--
To stop receiving notification emails like this one, please contact
apitrou@apache.org.