You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/30 23:12:55 UTC
arrow git commit: ARROW-632: [Python] Add support for FixedWidthBinary type
Repository: arrow
Updated Branches:
refs/heads/master edd6cfcd9 -> 4915ecf1e
ARROW-632: [Python] Add support for FixedWidthBinary type
Author: Phillip Cloud <cp...@gmail.com>
Closes #461 from cpcloud/ARROW-632 and squashes the following commits:
134644a [Phillip Cloud] ARROW-632: [Python] Add support for FixedWidthBinary type
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/4915ecf1
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/4915ecf1
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/4915ecf1
Branch: refs/heads/master
Commit: 4915ecf1e1dba625d916604d30f2575e4ddb6439
Parents: edd6cfc
Author: Phillip Cloud <cp...@gmail.com>
Authored: Thu Mar 30 19:12:49 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Mar 30 19:12:49 2017 -0400
----------------------------------------------------------------------
.gitignore | 3 +
cpp/src/arrow/builder.cc | 1 +
cpp/src/arrow/ipc/ipc-read-write-benchmark.cc | 4 +-
cpp/src/arrow/ipc/reader.cc | 2 +-
cpp/src/arrow/python/builtin_convert.cc | 101 +++++++++++++---
cpp/src/arrow/python/builtin_convert.h | 17 ++-
cpp/src/arrow/python/pandas_convert.cc | 131 ++++++++++++++++++---
cpp/src/arrow/util/logging.h | 7 +-
python/pyarrow/__init__.py | 5 +-
python/pyarrow/array.pxd | 8 ++
python/pyarrow/array.pyx | 16 ++-
python/pyarrow/includes/libarrow.pxd | 8 ++
python/pyarrow/includes/pyarrow.pxd | 3 +
python/pyarrow/scalar.pxd | 5 +
python/pyarrow/scalar.pyx | 19 ++-
python/pyarrow/schema.pxd | 6 +
python/pyarrow/schema.pyx | 42 +++++--
python/pyarrow/tests/test_convert_builtin.py | 13 ++
python/pyarrow/tests/test_convert_pandas.py | 17 +++
python/pyarrow/tests/test_scalars.py | 14 +++
20 files changed, 367 insertions(+), 55 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index a00cbba..5e28b36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,6 @@
*.dylib
.build_cache_dir
MANIFEST
+
+cpp/.idea/
+python/.eggs/
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 52a785d..82b6214 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -542,6 +542,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(DOUBLE, DoubleBuilder);
BUILDER_CASE(STRING, StringBuilder);
BUILDER_CASE(BINARY, BinaryBuilder);
+ BUILDER_CASE(FIXED_WIDTH_BINARY, FixedWidthBinaryBuilder);
case Type::LIST: {
std::shared_ptr<ArrayBuilder> value_builder;
std::shared_ptr<DataType> value_type =
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
index 1aecdbc..b385929 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
@@ -80,7 +80,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) { // NOLINT non-const
int32_t metadata_length;
int64_t body_length;
if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length,
- default_memory_pool())
+ default_memory_pool())
.ok()) {
state.SkipWithError("Failed to write!");
}
@@ -101,7 +101,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) { // NOLINT non-const r
int32_t metadata_length;
int64_t body_length;
if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length,
- default_memory_pool())
+ default_memory_pool())
.ok()) {
state.SkipWithError("Failed to write!");
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/ipc/reader.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index b47b773..00ea20c 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -32,8 +32,8 @@
#include "arrow/ipc/util.h"
#include "arrow/status.h"
#include "arrow/table.h"
-#include "arrow/type.h"
#include "arrow/tensor.h"
+#include "arrow/type.h"
#include "arrow/util/logging.h"
namespace arrow {
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 6e59845..72e8677 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -23,6 +23,7 @@
#include "arrow/api.h"
#include "arrow/status.h"
+#include "arrow/util/logging.h"
#include "arrow/python/helpers.h"
#include "arrow/python/util/datetime.h"
@@ -200,18 +201,25 @@ class SeqVisitor {
int nesting_histogram_[MAX_NESTING_LEVELS];
};
-// Non-exhaustive type inference
-Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
- *size = PySequence_Size(obj);
+Status InferArrowSize(PyObject* obj, int64_t* size) {
+ *size = static_cast<int64_t>(PySequence_Size(obj));
if (PyErr_Occurred()) {
// Not a sequence
PyErr_Clear();
return Status::TypeError("Object is not a sequence");
}
+ return Status::OK();
+}
+
+// Non-exhaustive type inference
+Status InferArrowTypeAndSize(
+ PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
+ RETURN_NOT_OK(InferArrowSize(obj, size));
// For 0-length sequences, refuse to guess
if (*size == 0) { *out_type = null(); }
+ PyDateTime_IMPORT;
SeqVisitor seq_visitor;
RETURN_NOT_OK(seq_visitor.Visit(obj));
RETURN_NOT_OK(seq_visitor.Validate());
@@ -253,7 +261,7 @@ class TypedConverter : public SeqConverter {
class BoolConverter : public TypedConverter<BooleanBuilder> {
public:
Status AppendData(PyObject* seq) override {
- Py_ssize_t size = PySequence_Size(seq);
+ int64_t size = static_cast<int64_t>(PySequence_Size(seq));
RETURN_NOT_OK(typed_builder_->Reserve(size));
for (int64_t i = 0; i < size; ++i) {
OwnedRef item(PySequence_GetItem(seq, i));
@@ -275,14 +283,14 @@ class Int64Converter : public TypedConverter<Int64Builder> {
public:
Status AppendData(PyObject* seq) override {
int64_t val;
- Py_ssize_t size = PySequence_Size(seq);
+ int64_t size = static_cast<int64_t>(PySequence_Size(seq));
RETURN_NOT_OK(typed_builder_->Reserve(size));
for (int64_t i = 0; i < size; ++i) {
OwnedRef item(PySequence_GetItem(seq, i));
if (item.obj() == Py_None) {
typed_builder_->AppendNull();
} else {
- val = PyLong_AsLongLong(item.obj());
+ val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
RETURN_IF_PYERROR();
typed_builder_->Append(val);
}
@@ -294,7 +302,7 @@ class Int64Converter : public TypedConverter<Int64Builder> {
class DateConverter : public TypedConverter<Date64Builder> {
public:
Status AppendData(PyObject* seq) override {
- Py_ssize_t size = PySequence_Size(seq);
+ int64_t size = static_cast<int64_t>(PySequence_Size(seq));
RETURN_NOT_OK(typed_builder_->Reserve(size));
for (int64_t i = 0; i < size; ++i) {
OwnedRef item(PySequence_GetItem(seq, i));
@@ -312,7 +320,7 @@ class DateConverter : public TypedConverter<Date64Builder> {
class TimestampConverter : public TypedConverter<TimestampBuilder> {
public:
Status AppendData(PyObject* seq) override {
- Py_ssize_t size = PySequence_Size(seq);
+ int64_t size = static_cast<int64_t>(PySequence_Size(seq));
RETURN_NOT_OK(typed_builder_->Reserve(size));
for (int64_t i = 0; i < size; ++i) {
OwnedRef item(PySequence_GetItem(seq, i));
@@ -334,7 +342,8 @@ class TimestampConverter : public TypedConverter<TimestampBuilder> {
epoch.tm_year = 70;
epoch.tm_mday = 1;
// Microseconds since the epoch
- int64_t val = lrint(difftime(mktime(&datetime), mktime(&epoch))) * 1000000 + us;
+ int64_t val = static_cast<int64_t>(
+ lrint(difftime(mktime(&datetime), mktime(&epoch))) * 1000000 + us);
typed_builder_->Append(val);
}
}
@@ -346,7 +355,7 @@ class DoubleConverter : public TypedConverter<DoubleBuilder> {
public:
Status AppendData(PyObject* seq) override {
double val;
- Py_ssize_t size = PySequence_Size(seq);
+ int64_t size = static_cast<int64_t>(PySequence_Size(seq));
RETURN_NOT_OK(typed_builder_->Reserve(size));
for (int64_t i = 0; i < size; ++i) {
OwnedRef item(PySequence_GetItem(seq, i));
@@ -369,7 +378,7 @@ class BytesConverter : public TypedConverter<BinaryBuilder> {
PyObject* bytes_obj;
OwnedRef tmp;
const char* bytes;
- int64_t length;
+ Py_ssize_t length;
Py_ssize_t size = PySequence_Size(seq);
for (int64_t i = 0; i < size; ++i) {
item = PySequence_GetItem(seq, i);
@@ -385,7 +394,8 @@ class BytesConverter : public TypedConverter<BinaryBuilder> {
} else if (PyBytes_Check(item)) {
bytes_obj = item;
} else {
- return Status::TypeError("Non-string value encountered");
+ return Status::TypeError(
+ "Value that cannot be converted to bytes was encountered");
}
// No error checking
length = PyBytes_GET_SIZE(bytes_obj);
@@ -396,6 +406,41 @@ class BytesConverter : public TypedConverter<BinaryBuilder> {
}
};
+class FixedWidthBytesConverter : public TypedConverter<FixedWidthBinaryBuilder> {
+ public:
+ Status AppendData(PyObject* seq) override {
+ PyObject* item;
+ PyObject* bytes_obj;
+ OwnedRef tmp;
+ Py_ssize_t expected_length = std::dynamic_pointer_cast<FixedWidthBinaryType>(
+ typed_builder_->type())->byte_width();
+ Py_ssize_t size = PySequence_Size(seq);
+ for (int64_t i = 0; i < size; ++i) {
+ item = PySequence_GetItem(seq, i);
+ OwnedRef holder(item);
+
+ if (item == Py_None) {
+ RETURN_NOT_OK(typed_builder_->AppendNull());
+ continue;
+ } else if (PyUnicode_Check(item)) {
+ tmp.reset(PyUnicode_AsUTF8String(item));
+ RETURN_IF_PYERROR();
+ bytes_obj = tmp.obj();
+ } else if (PyBytes_Check(item)) {
+ bytes_obj = item;
+ } else {
+ return Status::TypeError(
+ "Value that cannot be converted to bytes was encountered");
+ }
+ // No error checking
+ RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
+ RETURN_NOT_OK(typed_builder_->Append(
+ reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj))));
+ }
+ return Status::OK();
+ }
+};
+
class UTF8Converter : public TypedConverter<StringBuilder> {
public:
Status AppendData(PyObject* seq) override {
@@ -403,7 +448,7 @@ class UTF8Converter : public TypedConverter<StringBuilder> {
PyObject* bytes_obj;
OwnedRef tmp;
const char* bytes;
- int64_t length;
+ Py_ssize_t length;
Py_ssize_t size = PySequence_Size(seq);
for (int64_t i = 0; i < size; ++i) {
item = PySequence_GetItem(seq, i);
@@ -465,6 +510,8 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
return std::make_shared<DoubleConverter>();
case Type::BINARY:
return std::make_shared<BytesConverter>();
+ case Type::FIXED_WIDTH_BINARY:
+ return std::make_shared<FixedWidthBytesConverter>();
case Type::STRING:
return std::make_shared<UTF8Converter>();
case Type::LIST:
@@ -472,7 +519,6 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
case Type::STRUCT:
default:
return nullptr;
- break;
}
}
@@ -492,6 +538,7 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
const std::shared_ptr<ArrayBuilder>& builder) {
+ PyDateTime_IMPORT;
std::shared_ptr<SeqConverter> converter = GetConverter(type);
if (converter == nullptr) {
std::stringstream ss;
@@ -506,9 +553,12 @@ Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out) {
std::shared_ptr<DataType> type;
int64_t size;
- PyDateTime_IMPORT;
- RETURN_NOT_OK(InferArrowType(obj, &size, &type));
+ RETURN_NOT_OK(InferArrowTypeAndSize(obj, &size, &type));
+ return ConvertPySequence(obj, pool, out, type, size);
+}
+Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out,
+ const std::shared_ptr<DataType>& type, int64_t size) {
// Handle NA / NullType case
if (type->type == Type::NA) {
out->reset(new NullArray(size));
@@ -519,9 +569,26 @@ Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>
std::shared_ptr<ArrayBuilder> builder;
RETURN_NOT_OK(MakeBuilder(pool, type, &builder));
RETURN_NOT_OK(AppendPySequence(obj, type, builder));
-
return builder->Finish(out);
}
+Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out,
+ const std::shared_ptr<DataType>& type) {
+ int64_t size;
+ RETURN_NOT_OK(InferArrowSize(obj, &size));
+ return ConvertPySequence(obj, pool, out, type, size);
+}
+
+Status CheckPythonBytesAreFixedLength(PyObject* obj, Py_ssize_t expected_length) {
+ const Py_ssize_t length = PyBytes_GET_SIZE(obj);
+ if (length != expected_length) {
+ std::stringstream ss;
+ ss << "Found byte string of length " << length << ", expected length is "
+ << expected_length;
+ return Status::TypeError(ss.str());
+ }
+ return Status::OK();
+}
+
} // namespace py
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/python/builtin_convert.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index 7b50990..00ff0fd 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -38,16 +38,31 @@ class Status;
namespace py {
-ARROW_EXPORT arrow::Status InferArrowType(
+ARROW_EXPORT arrow::Status InferArrowTypeAndSize(
PyObject* obj, int64_t* size, std::shared_ptr<arrow::DataType>* out_type);
+ARROW_EXPORT arrow::Status InferArrowSize(PyObject* obj, int64_t* size);
ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj,
const std::shared_ptr<arrow::DataType>& type,
const std::shared_ptr<arrow::ArrayBuilder>& builder);
+// Type and size inference
ARROW_EXPORT
Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out);
+// Size inference
+ARROW_EXPORT
+Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out,
+ const std::shared_ptr<DataType>& type);
+
+// No inference
+ARROW_EXPORT
+Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out,
+ const std::shared_ptr<DataType>& type, int64_t size);
+
+ARROW_EXPORT Status CheckPythonBytesAreFixedLength(
+ PyObject* obj, Py_ssize_t expected_length);
+
} // namespace py
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index db2e90e..68a8d7d 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -147,8 +147,8 @@ Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
return Status::OK();
}
-Status AppendObjectStrings(StringBuilder& string_builder, PyObject** objects,
- int64_t objects_length, bool* have_bytes) {
+Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
+ PyObject** objects, bool* have_bytes) {
PyObject* obj;
for (int64_t i = 0; i < objects_length; ++i) {
@@ -160,15 +160,45 @@ Status AppendObjectStrings(StringBuilder& string_builder, PyObject** objects,
return Status::TypeError("failed converting unicode to UTF8");
}
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
- Status s = string_builder.Append(PyBytes_AS_STRING(obj), length);
+ Status s = builder->Append(PyBytes_AS_STRING(obj), length);
Py_DECREF(obj);
if (!s.ok()) { return s; }
} else if (PyBytes_Check(obj)) {
*have_bytes = true;
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
- RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
+ RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
} else {
- string_builder.AppendNull();
+ builder->AppendNull();
+ }
+ }
+
+ return Status::OK();
+}
+
+static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width,
+ FixedWidthBinaryBuilder* builder, PyObject** objects) {
+ PyObject* obj;
+
+ for (int64_t i = 0; i < objects_length; ++i) {
+ obj = objects[i];
+ if (PyUnicode_Check(obj)) {
+ obj = PyUnicode_AsUTF8String(obj);
+ if (obj == NULL) {
+ PyErr_Clear();
+ return Status::TypeError("failed converting unicode to UTF8");
+ }
+
+ RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
+ Status s =
+ builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)));
+ Py_DECREF(obj);
+ RETURN_NOT_OK(s);
+ } else if (PyBytes_Check(obj)) {
+ RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
+ RETURN_NOT_OK(
+ builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
+ } else {
+ builder->AppendNull();
}
}
@@ -192,6 +222,13 @@ struct WrapBytes<BinaryArray> {
}
};
+template <>
+struct WrapBytes<FixedWidthBinaryArray> {
+ static inline PyObject* Wrap(const uint8_t* data, int64_t length) {
+ return PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), length);
+ }
+};
+
static inline bool ListTypeSupported(const Type::type type_id) {
switch (type_id) {
case Type::UINT8:
@@ -226,7 +263,7 @@ class PandasConverter : public TypeVisitor {
arr_(reinterpret_cast<PyArrayObject*>(ao)),
mask_(nullptr) {
if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast<PyArrayObject*>(mo); }
- length_ = PyArray_SIZE(arr_);
+ length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
}
bool is_strided() const {
@@ -241,7 +278,7 @@ class PandasConverter : public TypeVisitor {
RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
null_bitmap_data_ = null_bitmap_->mutable_data();
- memset(null_bitmap_data_, 0, null_bytes);
+ memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
return Status::OK();
}
@@ -321,6 +358,8 @@ class PandasConverter : public TypeVisitor {
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
Status ConvertObjectStrings(std::shared_ptr<Array>* out);
+ Status ConvertObjectFixedWidthBytes(
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
Status ConvertBooleans(std::shared_ptr<Array>* out);
Status ConvertDates(std::shared_ptr<Array>* out);
Status ConvertLists(const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
@@ -402,13 +441,13 @@ Status PandasConverter::ConvertObjectStrings(std::shared_ptr<Array>* out) {
// and unicode mixed in the object array
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
- StringBuilder string_builder(pool_);
- RETURN_NOT_OK(string_builder.Resize(length_));
+ StringBuilder builder(pool_);
+ RETURN_NOT_OK(builder.Resize(length_));
Status s;
bool have_bytes = false;
- RETURN_NOT_OK(AppendObjectStrings(string_builder, objects, length_, &have_bytes));
- RETURN_NOT_OK(string_builder.Finish(out));
+ RETURN_NOT_OK(AppendObjectStrings(length_, &builder, objects, &have_bytes));
+ RETURN_NOT_OK(builder.Finish(out));
if (have_bytes) {
const auto& arr = static_cast<const StringArray&>(*out->get());
@@ -418,6 +457,20 @@ Status PandasConverter::ConvertObjectStrings(std::shared_ptr<Array>* out) {
return Status::OK();
}
+Status PandasConverter::ConvertObjectFixedWidthBytes(
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+ PyAcquireGIL lock;
+
+ PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ FixedWidthBinaryBuilder builder(pool_, type);
+ RETURN_NOT_OK(builder.Resize(length_));
+ RETURN_NOT_OK(AppendObjectFixedWidthBytes(length_,
+ std::dynamic_pointer_cast<FixedWidthBinaryType>(builder.type())->byte_width(),
+ &builder, objects));
+ RETURN_NOT_OK(builder.Finish(out));
+ return Status::OK();
+}
+
Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
PyAcquireGIL lock;
@@ -474,6 +527,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
switch (type_->type) {
case Type::STRING:
return ConvertObjectStrings(out);
+ case Type::FIXED_WIDTH_BINARY:
+ return ConvertObjectFixedWidthBytes(type_, out);
case Type::BOOL:
return ConvertBooleans(out);
case Type::DATE64:
@@ -543,7 +598,7 @@ inline Status PandasConverter::ConvertTypedLists(
int64_t size;
std::shared_ptr<DataType> inferred_type;
RETURN_NOT_OK(list_builder.Append(true));
- RETURN_NOT_OK(InferArrowType(objects[i], &size, &inferred_type));
+ RETURN_NOT_OK(InferArrowTypeAndSize(objects[i], &size, &inferred_type));
if (inferred_type->type != type->type) {
std::stringstream ss;
ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
@@ -577,14 +632,14 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
// TODO(uwe): Support more complex numpy array structures
RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
- int64_t size = PyArray_DIM(numpy_array, 0);
+ int64_t size = static_cast<int64_t>(PyArray_DIM(numpy_array, 0));
auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
- RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
+ RETURN_NOT_OK(AppendObjectStrings(size, value_builder.get(), data, &have_bytes));
} else if (PyList_Check(objects[i])) {
int64_t size;
std::shared_ptr<DataType> inferred_type;
RETURN_NOT_OK(list_builder.Append(true));
- RETURN_NOT_OK(InferArrowType(objects[i], &size, &inferred_type));
+ RETURN_NOT_OK(InferArrowTypeAndSize(objects[i], &size, &inferred_type));
if (inferred_type->type != Type::STRING) {
std::stringstream ss;
ss << inferred_type->ToString() << " cannot be converted to STRING.";
@@ -832,7 +887,7 @@ inline void ConvertIntegerWithNulls(const ChunkedArray& data, double* out_values
// Upcast to double, set NaN as appropriate
for (int i = 0; i < arr->length(); ++i) {
- *out_values++ = prim_arr->IsNull(i) ? NAN : in_values[i];
+ *out_values++ = prim_arr->IsNull(i) ? NAN : static_cast<double>(in_values[i]);
}
}
}
@@ -924,6 +979,36 @@ inline Status ConvertBinaryLike(const ChunkedArray& data, PyObject** out_values)
return Status::OK();
}
+inline Status ConvertFixedWidthBinary(const ChunkedArray& data, PyObject** out_values) {
+ PyAcquireGIL lock;
+ for (int c = 0; c < data.num_chunks(); c++) {
+ auto arr = static_cast<FixedWidthBinaryArray*>(data.chunk(c).get());
+
+ const uint8_t* data_ptr;
+ int32_t length =
+ std::dynamic_pointer_cast<FixedWidthBinaryType>(arr->type())->byte_width();
+ const bool has_nulls = data.null_count() > 0;
+ for (int64_t i = 0; i < arr->length(); ++i) {
+ if (has_nulls && arr->IsNull(i)) {
+ Py_INCREF(Py_None);
+ *out_values = Py_None;
+ } else {
+ data_ptr = arr->GetValue(i);
+ *out_values = WrapBytes<FixedWidthBinaryArray>::Wrap(data_ptr, length);
+ if (*out_values == nullptr) {
+ PyErr_Clear();
+ std::stringstream ss;
+ ss << "Wrapping "
+ << std::string(reinterpret_cast<const char*>(data_ptr), length) << " failed";
+ return Status::UnknownError(ss.str());
+ }
+ }
+ ++out_values;
+ }
+ }
+ return Status::OK();
+}
+
template <typename ArrowType>
inline Status ConvertListsLike(
const std::shared_ptr<Column>& col, PyObject** out_values) {
@@ -1058,6 +1143,8 @@ class ObjectBlock : public PandasBlock {
RETURN_NOT_OK(ConvertBinaryLike<BinaryArray>(data, out_buffer));
} else if (type == Type::STRING) {
RETURN_NOT_OK(ConvertBinaryLike<StringArray>(data, out_buffer));
+ } else if (type == Type::FIXED_WIDTH_BINARY) {
+ RETURN_NOT_OK(ConvertFixedWidthBinary(data, out_buffer));
} else if (type == Type::LIST) {
auto list_type = std::static_pointer_cast<ListType>(col->type());
switch (list_type->value_type()->type) {
@@ -1487,6 +1574,7 @@ class DataFrameBlockCreator {
break;
case Type::STRING:
case Type::BINARY:
+ case Type::FIXED_WIDTH_BINARY:
output_type = PandasBlock::OBJECT;
break;
case Type::DATE64:
@@ -1751,6 +1839,7 @@ class ArrowDeserializer {
CONVERT_CASE(DOUBLE);
CONVERT_CASE(BINARY);
CONVERT_CASE(STRING);
+ CONVERT_CASE(FIXED_WIDTH_BINARY);
CONVERT_CASE(DATE64);
CONVERT_CASE(TIMESTAMP);
CONVERT_CASE(DICTIONARY);
@@ -1845,6 +1934,7 @@ class ArrowDeserializer {
return ConvertBinaryLike<StringArray>(data_, out_values);
}
+ // Binary strings
template <int T2>
inline typename std::enable_if<T2 == Type::BINARY, Status>::type ConvertValues() {
RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
@@ -1852,6 +1942,15 @@ class ArrowDeserializer {
return ConvertBinaryLike<BinaryArray>(data_, out_values);
}
+ // Fixed length binary strings
+ template <int TYPE>
+ inline typename std::enable_if<TYPE == Type::FIXED_WIDTH_BINARY, Status>::type
+ ConvertValues() {
+ RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+ auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+ return ConvertFixedWidthBinary(data_, out_values);
+ }
+
#define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \
case Type::ArrowEnum: \
return ConvertListsLike<ArrowType>(col_, out_values);
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/cpp/src/arrow/util/logging.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h
index b22f07d..697d47c 100644
--- a/cpp/src/arrow/util/logging.h
+++ b/cpp/src/arrow/util/logging.h
@@ -38,9 +38,10 @@ namespace arrow {
#define ARROW_LOG_INTERNAL(level) ::arrow::internal::CerrLog(level)
#define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level)
-#define ARROW_CHECK(condition) \
- (condition) ? 0 : ::arrow::internal::FatalLog(ARROW_FATAL) \
- << __FILE__ << __LINE__ << " Check failed: " #condition " "
+#define ARROW_CHECK(condition) \
+ (condition) ? 0 \
+ : ::arrow::internal::FatalLog(ARROW_FATAL) \
+ << __FILE__ << __LINE__ << " Check failed: " #condition " "
#ifdef NDEBUG
#define ARROW_DFATAL ARROW_WARNING
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index dce4389..66b6038 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -55,7 +55,7 @@ from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
FloatValue, DoubleValue, ListValue,
- BinaryValue, StringValue)
+ BinaryValue, StringValue, FixedWidthBinaryValue)
import pyarrow.schema as _schema
@@ -65,7 +65,8 @@ from pyarrow.schema import (null, bool_,
timestamp, date32, date64,
float_, double, binary, string,
list_, struct, dictionary, field,
- DataType, Field, Schema, schema)
+ DataType, FixedWidthBinaryType,
+ Field, Schema, schema)
from pyarrow.table import Column, RecordBatch, Table, concat_tables
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd
index c3e7997..a7241c6 100644
--- a/python/pyarrow/array.pxd
+++ b/python/pyarrow/array.pxd
@@ -24,9 +24,11 @@ from pyarrow.schema cimport DataType
from cpython cimport PyObject
+
cdef extern from "Python.h":
int PySlice_Check(object)
+
cdef class Array:
cdef:
shared_ptr[CArray] sp_array
@@ -38,6 +40,7 @@ cdef class Array:
cdef init(self, const shared_ptr[CArray]& sp_array)
cdef getitem(self, int64_t i)
+
cdef object box_array(const shared_ptr[CArray]& sp_array)
@@ -52,6 +55,7 @@ cdef class NumericArray(Array):
cdef class IntegerArray(NumericArray):
pass
+
cdef class FloatingPointArray(NumericArray):
pass
@@ -96,6 +100,10 @@ cdef class DoubleArray(FloatingPointArray):
pass
+cdef class FixedWidthBinaryArray(Array):
+ pass
+
+
cdef class ListArray(Array):
pass
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 6cae196..289baf2 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -37,6 +37,7 @@ cimport pyarrow.scalar as scalar
from pyarrow.scalar import NA
from pyarrow.schema cimport (DataType, Field, Schema, DictionaryType,
+ FixedWidthBinaryType,
box_data_type)
import pyarrow.schema as schema
@@ -197,7 +198,11 @@ cdef class Array:
if type is None:
check_status(pyarrow.ConvertPySequence(list_obj, pool, &sp_array))
else:
- raise NotImplementedError()
+ check_status(
+ pyarrow.ConvertPySequence(
+ list_obj, pool, &sp_array, type.sp_type
+ )
+ )
return box_array(sp_array)
@@ -385,6 +390,7 @@ cdef class Date64Array(NumericArray):
cdef class TimestampArray(NumericArray):
pass
+
cdef class Time32Array(NumericArray):
pass
@@ -392,6 +398,7 @@ cdef class Time32Array(NumericArray):
cdef class Time64Array(NumericArray):
pass
+
cdef class FloatArray(FloatingPointArray):
pass
@@ -400,6 +407,10 @@ cdef class DoubleArray(FloatingPointArray):
pass
+cdef class FixedWidthBinaryArray(Array):
+ pass
+
+
cdef class ListArray(Array):
pass
@@ -506,7 +517,8 @@ cdef dict _array_classes = {
Type_LIST: ListArray,
Type_BINARY: BinaryArray,
Type_STRING: StringArray,
- Type_DICTIONARY: DictionaryArray
+ Type_DICTIONARY: DictionaryArray,
+ Type_FIXED_WIDTH_BINARY: FixedWidthBinaryArray,
}
cdef object box_array(const shared_ptr[CArray]& sp_array):
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8e428b4..b44ade5 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -45,6 +45,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
Type_TIME64" arrow::Type::TIME64"
Type_BINARY" arrow::Type::BINARY"
Type_STRING" arrow::Type::STRING"
+ Type_FIXED_WIDTH_BINARY" arrow::Type::FIXED_WIDTH_BINARY"
Type_LIST" arrow::Type::LIST"
Type_STRUCT" arrow::Type::STRUCT"
@@ -139,6 +140,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CStringType" arrow::StringType"(CDataType):
pass
+ cdef cppclass CFixedWidthBinaryType" arrow::FixedWidthBinaryType"(CFixedWidthType):
+ CFixedWidthBinaryType(int byte_width)
+ int byte_width()
+
cdef cppclass CField" arrow::Field":
c_string name
shared_ptr[CDataType] type
@@ -203,6 +208,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CDoubleArray" arrow::DoubleArray"(CArray):
double Value(int i)
+ cdef cppclass CFixedWidthBinaryArray" arrow::FixedWidthBinaryArray"(CArray):
+ const uint8_t* GetValue(int i)
+
cdef cppclass CListArray" arrow::ListArray"(CArray):
const int32_t* raw_value_offsets()
int32_t value_offset(int i)
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index c3fdf4b..8142c1c 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -30,6 +30,9 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
CStatus ConvertPySequence(object obj, CMemoryPool* pool,
shared_ptr[CArray]* out)
+ CStatus ConvertPySequence(object obj, CMemoryPool* pool,
+ shared_ptr[CArray]* out,
+ const shared_ptr[CDataType]& type)
CStatus PandasDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/scalar.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxd b/python/pyarrow/scalar.pxd
index 551aeb9..e9cc3cb 100644
--- a/python/pyarrow/scalar.pxd
+++ b/python/pyarrow/scalar.pxd
@@ -61,6 +61,11 @@ cdef class ListValue(ArrayValue):
cdef class StringValue(ArrayValue):
pass
+
+cdef class FixedWidthBinaryValue(ArrayValue):
+ pass
+
+
cdef object box_scalar(DataType type,
const shared_ptr[CArray]& sp_array,
int64_t index)
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 1b7e67b..f4a1c9e 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -224,6 +224,22 @@ cdef class ListValue(ArrayValue):
return result
+cdef class FixedWidthBinaryValue(ArrayValue):
+
+ def as_py(self):
+ cdef:
+ CFixedWidthBinaryArray* ap
+ CFixedWidthBinaryType* ap_type
+ int32_t length
+ const char* data
+ ap = <CFixedWidthBinaryArray*> self.sp_array.get()
+ ap_type = <CFixedWidthBinaryType*> ap.type().get()
+ length = ap_type.byte_width()
+ data = <const char*> ap.GetValue(self.index)
+ return cp.PyBytes_FromStringAndSize(data, length)
+
+
+
cdef dict _scalar_classes = {
Type_BOOL: BooleanValue,
Type_UINT8: Int8Value,
@@ -241,7 +257,8 @@ cdef dict _scalar_classes = {
Type_DOUBLE: DoubleValue,
Type_LIST: ListValue,
Type_BINARY: BinaryValue,
- Type_STRING: StringValue
+ Type_STRING: StringValue,
+ Type_FIXED_WIDTH_BINARY: FixedWidthBinaryValue,
}
cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array,
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd
index 15ee5f1..c0c2c70 100644
--- a/python/pyarrow/schema.pxd
+++ b/python/pyarrow/schema.pxd
@@ -19,6 +19,7 @@ from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CDataType,
CDictionaryType,
CTimestampType,
+ CFixedWidthBinaryType,
CField, CSchema)
cdef class DataType:
@@ -39,6 +40,11 @@ cdef class TimestampType(DataType):
const CTimestampType* ts_type
+cdef class FixedWidthBinaryType(DataType):
+ cdef:
+ const CFixedWidthBinaryType* fixed_width_binary_type
+
+
cdef class Field:
cdef:
shared_ptr[CField] sp_field
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index 4f02901..532a318 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -28,6 +28,7 @@ from pyarrow.compat import frombytes, tobytes
from pyarrow.array cimport Array
from pyarrow.error cimport check_status
from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType,
+ CFixedWidthBinaryType,
TimeUnit_SECOND, TimeUnit_MILLI,
TimeUnit_MICRO, TimeUnit_NANO,
Type, TimeUnit)
@@ -52,7 +53,7 @@ cdef class DataType:
return frombytes(self.type.ToString())
def __repr__(self):
- return 'DataType({0})'.format(str(self))
+ return '{0.__class__.__name__}({0})'.format(self)
def __richcmp__(DataType self, DataType other, int op):
if op == cpython.Py_EQ:
@@ -69,9 +70,6 @@ cdef class DictionaryType(DataType):
DataType.init(self, type)
self.dict_type = <const CDictionaryType*> type.get()
- def __repr__(self):
- return 'DictionaryType({0})'.format(str(self))
-
cdef class TimestampType(DataType):
@@ -92,8 +90,17 @@ cdef class TimestampType(DataType):
else:
return None
- def __repr__(self):
- return 'TimestampType({0})'.format(str(self))
+
+cdef class FixedWidthBinaryType(DataType):
+
+ cdef init(self, const shared_ptr[CDataType]& type):
+ DataType.init(self, type)
+ self.fixed_width_binary_type = <const CFixedWidthBinaryType*> type.get()
+
+ property byte_width:
+
+ def __get__(self):
+ return self.fixed_width_binary_type.byte_width()
cdef class Field:
@@ -348,11 +355,24 @@ def string():
return primitive_type(la.Type_STRING)
-def binary():
- """
- Binary (PyBytes-like) type
+def binary(int length=-1):
+ """Binary (PyBytes-like) type
+
+ Parameters
+ ----------
+ length : int, optional, default -1
+ If length == -1 then return a variable length binary type. If length is
+ greater than or equal to 0 then return a fixed width binary type of
+ width `length`.
"""
- return primitive_type(la.Type_BINARY)
+ if length == -1:
+ return primitive_type(la.Type_BINARY)
+
+ cdef FixedWidthBinaryType out = FixedWidthBinaryType()
+ cdef shared_ptr[CDataType] fixed_width_binary_type
+ fixed_width_binary_type.reset(new CFixedWidthBinaryType(length))
+ out.init(fixed_width_binary_type)
+ return out
def list_(DataType value_type):
@@ -408,6 +428,8 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type):
out = DictionaryType()
elif type.get().type == la.Type_TIMESTAMP:
out = TimestampType()
+ elif type.get().type == la.Type_FIXED_WIDTH_BINARY:
+ out = FixedWidthBinaryType()
else:
out = DataType()
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 7915f97..9925125 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -92,6 +92,19 @@ class TestConvertList(unittest.TestCase):
assert arr.type == pyarrow.binary()
assert arr.to_pylist() == [b'foo', u1, None]
+ def test_fixed_width_bytes(self):
+ data = [b'foof', None, b'barb', b'2346']
+ arr = pyarrow.from_pylist(data, type=pyarrow.binary(4))
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pyarrow.binary(4)
+ assert arr.to_pylist() == data
+
+ def test_fixed_width_bytes_does_not_accept_varying_lengths(self):
+ data = [b'foo', None, b'barb', b'2346']
+ with self.assertRaises(pyarrow.error.ArrowException):
+ pyarrow.from_pylist(data, type=pyarrow.binary(4))
+
def test_date(self):
data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1),
datetime.date(2040, 2, 26)]
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index ea7a892..f7cb47f 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -244,6 +244,23 @@ class TestPandasConversion(unittest.TestCase):
expected = pd.DataFrame({'strings': values2})
self._check_pandas_roundtrip(df, expected)
+ def test_fixed_width_bytes(self):
+ values = [b'foo', None, b'bar', None, None, b'hey']
+ df = pd.DataFrame({'strings': values})
+ schema = A.Schema.from_fields([A.field('strings', A.binary(3))])
+ table = A.Table.from_pandas(df, schema=schema)
+ assert table.schema[0].type == schema[0].type
+ assert table.schema[0].name == schema[0].name
+ result = table.to_pandas()
+ tm.assert_frame_equal(result, df)
+
+ def test_fixed_width_bytes_does_not_accept_varying_lengths(self):
+ values = [b'foo', None, b'ba', None, None, b'hey']
+ df = pd.DataFrame({'strings': values})
+ schema = A.Schema.from_fields([A.field('strings', A.binary(3))])
+ with self.assertRaises(A.error.ArrowException):
+ A.Table.from_pandas(df, schema=schema)
+
def test_timestamps_notimezone_no_nulls(self):
df = pd.DataFrame({
'datetime64': np.array([
http://git-wip-us.apache.org/repos/asf/arrow/blob/4915ecf1/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index d56481c..265ce8d 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -87,6 +87,20 @@ class TestScalars(unittest.TestCase):
assert v == b'bar'
assert isinstance(v, bytes)
+ def test_fixed_width_bytes(self):
+ data = [b'foof', None, b'barb']
+ arr = A.from_pylist(data, type=A.binary(4))
+
+ v = arr[0]
+ assert isinstance(v, A.FixedWidthBinaryValue)
+ assert v.as_py() == b'foof'
+
+ assert arr[1] is A.NA
+
+ v = arr[2].as_py()
+ assert v == b'barb'
+ assert isinstance(v, bytes)
+
def test_list(self):
arr = A.from_pylist([['foo', None], None, ['bar'], []])