You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/08/01 10:50:34 UTC
[arrow] branch master updated: ARROW-6000: [Python] Add support for
LargeString and LargeBinary types
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new eb73b96 ARROW-6000: [Python] Add support for LargeString and LargeBinary types
eb73b96 is described below
commit eb73b962e42b5ae6983bf026ebf825f1f707e245
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Aug 1 12:50:21 2019 +0200
ARROW-6000: [Python] Add support for LargeString and LargeBinary types
Also fix a bug in Take / Filter for large binary types.
Closes #4927 from pitrou/ARROW-6000-py-large-binary and squashes the following commits:
0bb1b7f89 <Antoine Pitrou> Fix Take on LargeBinary / LargeString data
9672ca4c3 <Antoine Pitrou> ARROW-6000: Add support for LargeString and LargeBinary types
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/array/builder_binary.h | 16 +++++--
cpp/src/arrow/array/builder_decimal.h | 2 +
cpp/src/arrow/array/builder_primitive.h | 3 ++
cpp/src/arrow/array/builder_time.h | 1 +
cpp/src/arrow/compute/kernels/filter-test.cc | 22 ++++++---
cpp/src/arrow/compute/kernels/take-internal.h | 20 ++++----
cpp/src/arrow/compute/kernels/take-test.cc | 31 ++++++++----
cpp/src/arrow/python/helpers.cc | 2 +
cpp/src/arrow/python/helpers.h | 6 +++
cpp/src/arrow/python/python_to_arrow.cc | 66 +++++++++++++++----------
cpp/src/arrow/type.h | 2 +
docs/source/python/api/arrays.rst | 4 ++
docs/source/python/api/datatypes.rst | 6 +++
python/pyarrow/__init__.py | 9 +++-
python/pyarrow/array.pxi | 40 ++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 20 +++++++-
python/pyarrow/lib.pyx | 2 +
python/pyarrow/scalar.pxi | 69 +++++++++++++++++++++++++++
python/pyarrow/tests/strategies.py | 8 +++-
python/pyarrow/tests/test_convert_builtin.py | 58 ++++++++++++++++++----
python/pyarrow/tests/test_scalars.py | 37 ++++++++++++++
python/pyarrow/tests/test_types.py | 16 +++++++
python/pyarrow/types.pxi | 32 +++++++++++++
python/pyarrow/types.py | 22 +++++++++
24 files changed, 424 insertions(+), 70 deletions(-)
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 7ae4d31..869e61f 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -294,6 +294,11 @@ class BaseBinaryBuilder : public ArrayBuilder {
return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
}
+ // Cannot make this a static attribute because of linking issues
+ static constexpr int64_t memory_limit() {
+ return std::numeric_limits<offset_type>::max() - 1;
+ }
+
protected:
TypedBufferBuilder<offset_type> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;
@@ -315,11 +320,6 @@ class BaseBinaryBuilder : public ArrayBuilder {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
}
-
- // Cannot make this a static attribute because of linking issues
- static constexpr int64_t memory_limit() {
- return std::numeric_limits<offset_type>::max() - 1;
- }
};
/// \class BinaryBuilder
@@ -387,6 +387,8 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
public:
+ using TypeClass = FixedSizeBinaryType;
+
FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
@@ -471,6 +473,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
/// This view becomes invalid on the next modifying operation.
util::string_view GetView(int64_t i) const;
+ static constexpr int64_t memory_limit() {
+ return std::numeric_limits<int64_t>::max() - 1;
+ }
+
protected:
int32_t byte_width_;
BufferBuilder byte_builder_;
diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h
index e64d165..1b8d3b4 100644
--- a/cpp/src/arrow/array/builder_decimal.h
+++ b/cpp/src/arrow/array/builder_decimal.h
@@ -28,6 +28,8 @@ class Decimal128;
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
+ using TypeClass = Decimal128Type;
+
explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h
index 8abbe02..fecb8ca 100644
--- a/cpp/src/arrow/array/builder_primitive.h
+++ b/cpp/src/arrow/array/builder_primitive.h
@@ -58,6 +58,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
template <typename T>
class NumericBuilder : public ArrayBuilder {
public:
+ using TypeClass = T;
using value_type = typename T::c_type;
using ArrayType = typename TypeTraits<T>::ArrayType;
using ArrayBuilder::ArrayBuilder;
@@ -265,7 +266,9 @@ using DoubleBuilder = NumericBuilder<DoubleType>;
class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
public:
+ using TypeClass = BooleanType;
using value_type = bool;
+
explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h
index 3ff783b..e264176 100644
--- a/cpp/src/arrow/array/builder_time.h
+++ b/cpp/src/arrow/array/builder_time.h
@@ -34,6 +34,7 @@ namespace arrow {
class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder {
public:
+ using TypeClass = DayTimeIntervalType;
using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc
index 45fd9e5..fa40753 100644
--- a/cpp/src/arrow/compute/kernels/filter-test.cc
+++ b/cpp/src/arrow/compute/kernels/filter-test.cc
@@ -285,18 +285,26 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
}
}
-class TestFilterKernelWithString : public TestFilterKernel<StringType> {
+using StringTypes =
+ ::testing::Types<BinaryType, StringType, LargeBinaryType, LargeStringType>;
+
+template <typename TypeClass>
+class TestFilterKernelWithString : public TestFilterKernel<TypeClass> {
protected:
+ std::shared_ptr<DataType> value_type() {
+ return TypeTraits<TypeClass>::type_singleton();
+ }
+
void AssertFilter(const std::string& values, const std::string& filter,
const std::string& expected) {
- TestFilterKernel<StringType>::AssertFilter(utf8(), values, filter, expected);
+ TestFilterKernel<TypeClass>::AssertFilter(value_type(), values, filter, expected);
}
void AssertFilterDictionary(const std::string& dictionary_values,
const std::string& dictionary_filter,
const std::string& filter,
const std::string& expected_filter) {
- auto dict = ArrayFromJSON(utf8(), dictionary_values);
- auto type = dictionary(int8(), utf8());
+ auto dict = ArrayFromJSON(value_type(), dictionary_values);
+ auto type = dictionary(int8(), value_type());
std::shared_ptr<Array> values, actual, expected;
ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_filter),
dict, &values));
@@ -307,13 +315,15 @@ class TestFilterKernelWithString : public TestFilterKernel<StringType> {
}
};
-TEST_F(TestFilterKernelWithString, FilterString) {
+TYPED_TEST_CASE(TestFilterKernelWithString, StringTypes);
+
+TYPED_TEST(TestFilterKernelWithString, FilterString) {
this->AssertFilter(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])");
this->AssertFilter(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])");
this->AssertFilter(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])");
}
-TEST_F(TestFilterKernelWithString, FilterDictionary) {
+TYPED_TEST(TestFilterKernelWithString, FilterDictionary) {
auto dict = R"(["a", "b", "c", "d", "e"])";
this->AssertFilterDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]");
this->AssertFilterDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]");
diff --git a/cpp/src/arrow/compute/kernels/take-internal.h b/cpp/src/arrow/compute/kernels/take-internal.h
index 04e89d1..b5fe87f 100644
--- a/cpp/src/arrow/compute/kernels/take-internal.h
+++ b/cpp/src/arrow/compute/kernels/take-internal.h
@@ -20,11 +20,13 @@
#include <algorithm>
#include <limits>
#include <memory>
+#include <type_traits>
#include <utility>
#include <vector>
#include "arrow/builder.h"
#include "arrow/compute/context.h"
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
@@ -37,21 +39,19 @@ namespace compute {
using internal::checked_cast;
using internal::checked_pointer_cast;
+// For non-binary builders, use regular value append
template <typename Builder, typename Scalar>
-static Status UnsafeAppend(Builder* builder, Scalar&& value) {
+static typename std::enable_if<
+ !std::is_base_of<BaseBinaryType, typename Builder::TypeClass>::value, Status>::type
+UnsafeAppend(Builder* builder, Scalar&& value) {
builder->UnsafeAppend(std::forward<Scalar>(value));
return Status::OK();
}
-// Use BinaryBuilder::UnsafeAppend, but reserve byte storage first
-static Status UnsafeAppend(BinaryBuilder* builder, util::string_view value) {
- RETURN_NOT_OK(builder->ReserveData(static_cast<int64_t>(value.size())));
- builder->UnsafeAppend(value);
- return Status::OK();
-}
-
-// Use StringBuilder::UnsafeAppend, but reserve character storage first
-static Status UnsafeAppend(StringBuilder* builder, util::string_view value) {
+// For binary builders, need to reserve byte storage first
+template <typename Builder>
+static enable_if_base_binary<typename Builder::TypeClass, Status> UnsafeAppend(
+ Builder* builder, util::string_view value) {
RETURN_NOT_OK(builder->ReserveData(static_cast<int64_t>(value.size())));
builder->UnsafeAppend(value);
return Status::OK();
diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc
index 6a8e30b..0f080fd 100644
--- a/cpp/src/arrow/compute/kernels/take-test.cc
+++ b/cpp/src/arrow/compute/kernels/take-test.cc
@@ -179,18 +179,26 @@ TYPED_TEST(TestTakeKernelWithNumeric, TakeRandomNumeric) {
}
}
-class TestTakeKernelWithString : public TestTakeKernel<StringType> {
- protected:
+using StringTypes =
+ ::testing::Types<BinaryType, StringType, LargeBinaryType, LargeStringType>;
+
+template <typename TypeClass>
+class TestTakeKernelWithString : public TestTakeKernel<TypeClass> {
+ public:
+ std::shared_ptr<DataType> value_type() {
+ return TypeTraits<TypeClass>::type_singleton();
+ }
+
void AssertTake(const std::string& values, const std::string& indices,
const std::string& expected) {
- TestTakeKernel<StringType>::AssertTake(utf8(), values, indices, expected);
+ TestTakeKernel<TypeClass>::AssertTake(value_type(), values, indices, expected);
}
void AssertTakeDictionary(const std::string& dictionary_values,
const std::string& dictionary_indices,
const std::string& indices,
const std::string& expected_indices) {
- auto dict = ArrayFromJSON(utf8(), dictionary_values);
- auto type = dictionary(int8(), utf8());
+ auto dict = ArrayFromJSON(value_type(), dictionary_values);
+ auto type = dictionary(int8(), value_type());
std::shared_ptr<Array> values, actual, expected;
ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_indices),
dict, &values));
@@ -201,19 +209,22 @@ class TestTakeKernelWithString : public TestTakeKernel<StringType> {
}
};
-TEST_F(TestTakeKernelWithString, TakeString) {
+TYPED_TEST_CASE(TestTakeKernelWithString, StringTypes);
+
+TYPED_TEST(TestTakeKernelWithString, TakeString) {
this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])");
this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]");
this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])");
+ std::shared_ptr<DataType> type = this->value_type();
std::shared_ptr<Array> arr;
ASSERT_RAISES(IndexError,
- this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr));
- ASSERT_RAISES(IndexError, this->Take(utf8(), R"(["a", "b", null, "ddd", "ee"])",
- int64(), "[2, 5]", &arr));
+ this->Take(type, R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr));
+ ASSERT_RAISES(IndexError, this->Take(type, R"(["a", "b", null, "ddd", "ee"])", int64(),
+ "[2, 5]", &arr));
}
-TEST_F(TestTakeKernelWithString, TakeDictionary) {
+TYPED_TEST(TestTakeKernelWithString, TakeDictionary) {
auto dict = R"(["a", "b", "c", "d", "e"])";
this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]");
this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]");
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index e44878f..7bbac81 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -62,6 +62,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(DOUBLE, float64);
GET_PRIMITIVE_TYPE(BINARY, binary);
GET_PRIMITIVE_TYPE(STRING, utf8);
+ GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
+ GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
default:
return nullptr;
}
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 4917f99..8661ee5 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -119,6 +119,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out,
return Status::OK();
}
+inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
+ // size is assumed to be non-negative
+ *out = static_cast<int64_t>(size);
+ return Status::OK();
+}
+
// \brief Print the Python object's __str__ form along with the passed error
// message
ARROW_PYTHON_EXPORT
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 28d8c13..424e309 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -429,18 +429,20 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert
namespace detail {
-template <typename BuilderType, typename AppendFunc>
-inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool* is_full,
- AppendFunc&& append_func) {
- int32_t length = -1;
- RETURN_NOT_OK(internal::CastSize(view.size, &length));
- DCHECK_GE(length, 0);
+template <typename BuilderType>
+inline Status AppendPyString(BuilderType* builder, const PyBytesView& view,
+ bool* is_full) {
+ if (view.size > BuilderType::memory_limit()) {
+ return Status::Invalid("string too large for datatype");
+ }
+ DCHECK_GE(view.size, 0);
// Did we reach the builder size limit?
- if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+ if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size >
+ BuilderType::memory_limit())) {
*is_full = true;
return Status::OK();
}
- RETURN_NOT_OK(append_func(view.bytes, length));
+ RETURN_NOT_OK(builder->Append(::arrow::util::string_view(view.bytes, view.size)));
*is_full = false;
return Status::OK();
}
@@ -448,10 +450,13 @@ inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool
inline Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) {
PyBytesView view;
RETURN_NOT_OK(view.FromString(obj));
- return AppendPyString(builder, view, is_full,
- [&builder](const char* bytes, int32_t length) {
- return builder->Append(bytes, length);
- });
+ return AppendPyString(builder, view, is_full);
+}
+
+inline Status BuilderAppend(LargeBinaryBuilder* builder, PyObject* obj, bool* is_full) {
+ PyBytesView view;
+ RETURN_NOT_OK(view.FromString(obj));
+ return AppendPyString(builder, view, is_full);
}
inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
@@ -466,9 +471,7 @@ inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
return internal::InvalidValue(obj, ss.str());
}
- return AppendPyString(
- builder, view, is_full,
- [&builder](const char* bytes, int32_t length) { return builder->Append(bytes); });
+ return AppendPyString(builder, view, is_full);
}
} // namespace detail
@@ -496,12 +499,15 @@ class BinaryLikeConverter : public TypedConverter<Type, BinaryLikeConverter<Type
class BytesConverter : public BinaryLikeConverter<BinaryType> {};
+class LargeBytesConverter : public BinaryLikeConverter<LargeBinaryType> {};
+
class FixedWidthBytesConverter : public BinaryLikeConverter<FixedSizeBinaryType> {};
// For String/UTF8, if strict_conversions enabled, we reject any non-UTF8,
// otherwise we allow but return results as BinaryArray
-template <bool STRICT>
-class StringConverter : public TypedConverter<StringType, StringConverter<STRICT>> {
+template <typename TypeClass, bool STRICT>
+class StringConverter
+ : public TypedConverter<TypeClass, StringConverter<TypeClass, STRICT>> {
public:
StringConverter() : binary_count_(0) {}
@@ -526,10 +532,7 @@ class StringConverter : public TypedConverter<StringType, StringConverter<STRICT
}
}
- return detail::AppendPyString(this->typed_builder_, string_view_, is_full,
- [this](const char* bytes, int32_t length) {
- return this->typed_builder_->Append(bytes, length);
- });
+ return detail::AppendPyString(this->typed_builder_, string_view_, is_full);
}
Status AppendItem(PyObject* obj) {
@@ -556,10 +559,13 @@ class StringConverter : public TypedConverter<StringType, StringConverter<STRICT
// We should have bailed out earlier
DCHECK(!STRICT);
+ using EquivalentBinaryType = typename TypeClass::EquivalentBinaryType;
+ using EquivalentBinaryArray = typename TypeTraits<EquivalentBinaryType>::ArrayType;
+
for (size_t i = 0; i < out->size(); ++i) {
auto binary_data = (*out)[i]->data()->Copy();
- binary_data->type = ::arrow::binary();
- (*out)[i] = std::make_shared<BinaryArray>(binary_data);
+ binary_data->type = TypeTraits<EquivalentBinaryType>::type_singleton();
+ (*out)[i] = std::make_shared<EquivalentBinaryArray>(binary_data);
}
}
return Status::OK();
@@ -871,14 +877,24 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
NUMERIC_CONVERTER(DOUBLE, DoubleType);
SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
+ SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter);
SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
case Type::STRING:
if (strict_conversions) {
- *out = std::unique_ptr<SeqConverter>(new StringConverter<true>());
+ *out = std::unique_ptr<SeqConverter>(new StringConverter<StringType, true>());
+ } else {
+ *out = std::unique_ptr<SeqConverter>(new StringConverter<StringType, false>());
+ }
+ break;
+ case Type::LARGE_STRING:
+ if (strict_conversions) {
+ *out =
+ std::unique_ptr<SeqConverter>(new StringConverter<LargeStringType, true>());
} else {
- *out = std::unique_ptr<SeqConverter>(new StringConverter<false>());
+ *out =
+ std::unique_ptr<SeqConverter>(new StringConverter<LargeStringType, false>());
}
break;
case Type::TIME32: {
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 753c73e..86df73b 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -670,6 +670,7 @@ class ARROW_EXPORT StringType : public BinaryType {
public:
static constexpr Type::type type_id = Type::STRING;
static constexpr bool is_utf8 = true;
+ using EquivalentBinaryType = BinaryType;
static constexpr const char* type_name() { return "utf8"; }
@@ -684,6 +685,7 @@ class ARROW_EXPORT LargeStringType : public LargeBinaryType {
public:
static constexpr Type::type type_id = Type::LARGE_STRING;
static constexpr bool is_utf8 = true;
+ using EquivalentBinaryType = LargeBinaryType;
static constexpr const char* type_name() { return "large_utf8"; }
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index db45eef..e10b5af 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -57,6 +57,8 @@ may expose data type-specific methods or properties.
BinaryArray
StringArray
FixedSizeBinaryArray
+ LargeBinaryArray
+ LargeStringArray
Time32Array
Time64Array
Date32Array
@@ -97,6 +99,8 @@ any of those classes directly.
BinaryValue
StringValue
FixedSizeBinaryValue
+ LargeBinaryValue
+ LargeStringValue
Time32Value
Time64Value
Date32Value
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 5ad0204..327bcf6 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -50,6 +50,9 @@ These should be used to create Arrow data types and schemas.
binary
string
utf8
+ large_binary
+ large_string
+ large_utf8
decimal128
list_
struct
@@ -129,6 +132,9 @@ represents a given data type (such as ``int32``) or general category
is_binary
is_unicode
is_string
+ is_large_binary
+ is_large_unicode
+ is_large_string
is_fixed_size_binary
is_map
is_dictionary
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index afe0636..51afa0f 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -52,7 +52,9 @@ from pyarrow.lib import (null, bool_,
uint8, uint16, uint32, uint64,
time32, time64, timestamp, date32, date64,
float16, float32, float64,
- binary, string, utf8, decimal128,
+ binary, string, utf8,
+ large_binary, large_string, large_utf8,
+ decimal128,
list_, struct, union, dictionary, field,
type_for_alias,
DataType, DictionaryType, ListType, StructType,
@@ -77,6 +79,7 @@ from pyarrow.lib import (null, bool_,
Int64Array, UInt64Array,
ListArray, UnionArray,
BinaryArray, StringArray,
+ LargeBinaryArray, LargeStringArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array,
@@ -87,7 +90,9 @@ from pyarrow.lib import (null, bool_,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
HalfFloatValue, FloatValue, DoubleValue, ListValue,
- BinaryValue, StringValue, FixedSizeBinaryValue,
+ BinaryValue, StringValue,
+ LargeBinaryValue, LargeStringValue,
+ FixedSizeBinaryValue,
DecimalValue, UnionValue, StructValue, DictionaryValue,
Date32Value, Date64Value,
Time32Value, Time64Value,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 15905a1..4341c41 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1178,12 +1178,50 @@ cdef class StringArray(Array):
null_count, offset)
+cdef class LargeStringArray(Array):
+ """
+ Concrete class for Arrow arrays of large string (or utf8) data type.
+ """
+
+ @staticmethod
+ def from_buffers(int length, Buffer value_offsets, Buffer data,
+ Buffer null_bitmap=None, int null_count=-1,
+ int offset=0):
+ """
+ Construct a LargeStringArray from value_offsets and data buffers.
+ If there are nulls in the data, also a null_bitmap and the matching
+ null_count must be passed.
+
+ Parameters
+ ----------
+ length : int
+ value_offsets : Buffer
+ data : Buffer
+ null_bitmap : Buffer, optional
+ null_count : int, default -1
+ offset : int, default 0
+
+ Returns
+ -------
+ string_array : LargeStringArray
+ """
+ return Array.from_buffers(large_utf8(), length,
+ [null_bitmap, value_offsets, data],
+ null_count, offset)
+
+
cdef class BinaryArray(Array):
"""
Concrete class for Arrow arrays of variable-sized binary data type.
"""
+cdef class LargeBinaryArray(Array):
+ """
+ Concrete class for Arrow arrays of large variable-sized binary data type.
+ """
+
+
cdef class DictionaryArray(Array):
"""
Concrete class for dictionary-encoded Arrow arrays.
@@ -1449,6 +1487,8 @@ cdef dict _array_classes = {
_Type_UNION: UnionArray,
_Type_BINARY: BinaryArray,
_Type_STRING: StringArray,
+ _Type_LARGE_BINARY: LargeBinaryArray,
+ _Type_LARGE_STRING: LargeStringArray,
_Type_DICTIONARY: DictionaryArray,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
_Type_DECIMAL: Decimal128Array,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 0f80ad7..bfda15d 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -65,6 +65,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_TIME64" arrow::Type::TIME64"
_Type_BINARY" arrow::Type::BINARY"
_Type_STRING" arrow::Type::STRING"
+ _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
+ _Type_LARGE_STRING" arrow::Type::LARGE_STRING"
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
_Type_LIST" arrow::Type::LIST"
@@ -437,12 +439,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
const CArray* UnsafeChild(int pos)
UnionMode mode()
- cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
+ cdef cppclass CBinaryArray" arrow::BinaryArray"(CArray):
const uint8_t* GetValue(int i, int32_t* length)
shared_ptr[CBuffer] value_data()
int32_t value_offset(int64_t i)
int32_t value_length(int64_t i)
+ cdef cppclass CLargeBinaryArray" arrow::LargeBinaryArray"(CArray):
+ const uint8_t* GetValue(int i, int64_t* length)
+ shared_ptr[CBuffer] value_data()
+ int64_t value_offset(int64_t i)
+ int64_t value_length(int64_t i)
+
cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray):
CStringArray(int64_t length, shared_ptr[CBuffer] value_offsets,
shared_ptr[CBuffer] data,
@@ -452,6 +460,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string GetString(int i)
+ cdef cppclass CLargeStringArray" arrow::LargeStringArray" \
+ (CLargeBinaryArray):
+ CLargeStringArray(int64_t length, shared_ptr[CBuffer] value_offsets,
+ shared_ptr[CBuffer] data,
+ shared_ptr[CBuffer] null_bitmap,
+ int64_t null_count,
+ int64_t offset)
+
+ c_string GetString(int i)
+
cdef cppclass CStructArray" arrow::StructArray"(CArray):
CStructArray(shared_ptr[CDataType] type, int64_t length,
vector[shared_ptr[CArray]] children,
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 2da5a83..0b33f39 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -87,6 +87,8 @@ Type_TIME32 = _Type_TIME32
Type_TIME64 = _Type_TIME64
Type_BINARY = _Type_BINARY
Type_STRING = _Type_STRING
+Type_LARGE_BINARY = _Type_LARGE_BINARY
+Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
Type_LIST = _Type_LIST
Type_STRUCT = _Type_STRUCT
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 326aefa..0ead3e5 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -437,6 +437,43 @@ cdef class StringValue(ArrayValue):
cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
return ap.GetString(self.index).decode('utf-8')
+ def as_buffer(self):
+ """
+ Return a view over this value as a Buffer object.
+ """
+ cdef:
+ CStringArray* ap = <CStringArray*> self.sp_array.get()
+ shared_ptr[CBuffer] buf
+
+ buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
+ ap.value_length(self.index))
+ return pyarrow_wrap_buffer(buf)
+
+
+cdef class LargeStringValue(ArrayValue):
+ """
+ Concrete class for large string (utf8) array elements.
+ """
+
+ def as_py(self):
+ """
+ Return this value as a Python unicode string.
+ """
+ cdef CLargeStringArray* ap = <CLargeStringArray*> self.sp_array.get()
+ return ap.GetString(self.index).decode('utf-8')
+
+ def as_buffer(self):
+ """
+ Return a view over this value as a Buffer object.
+ """
+ cdef:
+ CLargeStringArray* ap = <CLargeStringArray*> self.sp_array.get()
+ shared_ptr[CBuffer] buf
+
+ buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
+ ap.value_length(self.index))
+ return pyarrow_wrap_buffer(buf)
+
cdef class BinaryValue(ArrayValue):
"""
@@ -468,6 +505,36 @@ cdef class BinaryValue(ArrayValue):
return pyarrow_wrap_buffer(buf)
+cdef class LargeBinaryValue(ArrayValue):
+ """
+ Concrete class for large variable-sized binary array elements.
+ """
+
+ def as_py(self):
+ """
+ Return this value as a Python bytes object.
+ """
+ cdef:
+ const uint8_t* ptr
+ int64_t length
+ CLargeBinaryArray* ap = <CLargeBinaryArray*> self.sp_array.get()
+
+ ptr = ap.GetValue(self.index, &length)
+ return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
+
+ def as_buffer(self):
+ """
+ Return a view over this value as a Buffer object.
+ """
+ cdef:
+ CLargeBinaryArray* ap = <CLargeBinaryArray*> self.sp_array.get()
+ shared_ptr[CBuffer] buf
+
+ buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
+ ap.value_length(self.index))
+ return pyarrow_wrap_buffer(buf)
+
+
cdef class ListValue(ArrayValue):
"""
Concrete class for list array elements.
@@ -665,6 +732,8 @@ cdef dict _array_value_classes = {
_Type_UNION: UnionValue,
_Type_BINARY: BinaryValue,
_Type_STRING: StringValue,
+ _Type_LARGE_BINARY: LargeBinaryValue,
+ _Type_LARGE_STRING: LargeStringValue,
_Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
_Type_DECIMAL: DecimalValue,
_Type_STRUCT: StructValue,
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index a282864..7312ce3 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -38,6 +38,8 @@ bool_type = st.just(pa.bool_())
binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
+large_binary_type = st.just(pa.large_binary())
+large_string_type = st.just(pa.large_string())
signed_integer_types = st.sampled_from([
pa.int8(),
@@ -87,6 +89,8 @@ primitive_types = st.one_of(
bool_type,
binary_type,
string_type,
+ large_binary_type,
+ large_string_type,
numeric_types,
temporal_types
)
@@ -190,9 +194,9 @@ def arrays(draw, type, size=None):
elif pa.types.is_timestamp(type):
tz = pytz.timezone(type.tz) if type.tz is not None else None
value = st.datetimes(timezones=st.just(tz))
- elif pa.types.is_binary(type):
+ elif pa.types.is_binary(type) or pa.types.is_large_binary(type):
value = st.binary()
- elif pa.types.is_string(type):
+ elif pa.types.is_string(type) or pa.types.is_large_string(type):
value = st.text()
elif pa.types.is_decimal(type):
# TODO(kszucs): properly limit the precision
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 81d5952..f39706e 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -26,6 +26,7 @@ import collections
import datetime
import decimal
import itertools
+import math
import traceback
import sys
@@ -605,7 +606,7 @@ def test_sequence_unicode():
assert arr.to_pylist() == data
-def test_array_mixed_unicode_bytes():
+def check_array_mixed_unicode_bytes(binary_type, string_type):
values = [u'qux', b'foo', bytearray(b'barz')]
b_values = [b'qux', b'foo', b'barz']
u_values = [u'qux', u'foo', u'barz']
@@ -615,36 +616,75 @@ def test_array_mixed_unicode_bytes():
assert arr.type == pa.binary()
assert arr.equals(expected)
- arr = pa.array(values, type=pa.string())
- expected = pa.array(u_values, type=pa.string())
- assert arr.type == pa.string()
+ arr = pa.array(values, type=binary_type)
+ expected = pa.array(b_values, type=binary_type)
+ assert arr.type == binary_type
+ assert arr.equals(expected)
+
+ arr = pa.array(values, type=string_type)
+ expected = pa.array(u_values, type=string_type)
+ assert arr.type == string_type
assert arr.equals(expected)
+def test_array_mixed_unicode_bytes():
+ check_array_mixed_unicode_bytes(pa.binary(), pa.string())
+ check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
+
+
+@pytest.mark.large_memory
+@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
+def test_large_binary_array(ty):
+ # Construct a large binary array with more than 4GB of data
+ s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10
+ nrepeats = math.ceil((2**32 + 5) / len(s))
+ data = [s] * nrepeats
+ arr = pa.array(data, type=ty)
+ assert isinstance(arr, pa.Array)
+ assert arr.type == ty
+ assert len(arr) == nrepeats
+
+
+@pytest.mark.large_memory
+@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
+def test_large_binary_value(ty):
+ # Construct a large binary array with a single value larger than 4GB
+ s = b"0123456789abcdefghijklmnopqrstuvwxyz"
+ nrepeats = math.ceil((2**32 + 5) / len(s))
+ arr = pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
+ assert isinstance(arr, pa.Array)
+ assert arr.type == ty
+ assert len(arr) == 4
+ buf = arr[1].as_buffer()
+ assert len(buf) == len(s) * nrepeats
+
+
def test_sequence_bytes():
+ # bytes, unicode and bytearray inputs must all end up as binary values,
+ # both with an inferred type (ty=None) and with an explicit binary type.
u1 = b'ma\xc3\xb1ana'
data = [b'foo',
u1.decode('utf-8'), # unicode gets encoded,
bytearray(b'bar'),
None]
- for ty in [None, pa.binary()]:
+ for ty in [None, pa.binary(), pa.large_binary()]:
arr = pa.array(data, type=ty)
assert len(arr) == 4
assert arr.null_count == 1
- assert arr.type == pa.binary()
+ # ty=None means the type is inferred, which is expected to give binary.
+ # Spell the expected type out: `arr.type == ty or pa.binary()` parses as
+ # `(arr.type == ty) or pa.binary()` and would pass even on a mismatch.
+ expected_type = pa.binary() if ty is None else ty
+ assert arr.type == expected_type
assert arr.to_pylist() == [b'foo', u1, b'bar', None]
-def test_sequence_utf8_to_unicode():
+@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
+def test_sequence_utf8_to_unicode(ty):
# ARROW-1225
+ # bytes values are accepted for a string type and come back as unicode
data = [b'foo', None, b'bar']
- arr = pa.array(data, type=pa.string())
+ arr = pa.array(data, type=ty)
+ assert arr.type == ty
assert arr[0].as_py() == u'foo'
# test a non-utf8 unicode string
+ # (the UTF-16-LE encoded bytes are not valid UTF-8, hence ArrowInvalid)
val = (u'mañana').encode('utf-16-le')
with pytest.raises(pa.ArrowInvalid):
- pa.array([val], type=pa.string())
+ pa.array([val], type=ty)
def test_sequence_fixed_size_bytes():
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 406f4ee..ca7a10e 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -104,6 +104,25 @@ class TestScalars(unittest.TestCase):
assert v == u'mañana'
assert isinstance(v, unicode_type)
+    def test_large_string_unicode(self):
+        # Scalars taken out of a large_string array must behave like the
+        # unicode values they were built from.
+        values = [u'foo', None, u'mañana']
+        arr = pa.array(values, type=pa.large_string())
+
+        first = arr[0]
+        assert isinstance(first, pa.LargeStringValue)
+        assert first.as_py() == u'foo'
+        assert repr(first) == repr(u"foo")
+        assert str(first) == str(u"foo")
+        assert first == u'foo'
+        # Indexing the same slot a second time must give an equal value.
+        assert first == arr[0]
+
+        # The null entry is pa.NA
+        assert arr[1] is pa.NA
+
+        last = arr[2].as_py()
+        assert last == u'mañana'
+        assert isinstance(last, unicode_type)
+
def test_bytes(self):
arr = pa.array([b'foo', None, u('bar')])
@@ -122,6 +141,24 @@ class TestScalars(unittest.TestCase):
assert arr[1] is pa.NA
check_value(arr[2], b'bar')
+    def test_large_bytes(self):
+        # The third input goes through the u() helper and is checked
+        # against b'bar' below.
+        arr = pa.array([b'foo', None, u('bar')], type=pa.large_binary())
+
+        def assert_scalar(scalar, raw):
+            # The scalar must compare, print and convert like `raw`
+            assert isinstance(scalar, pa.LargeBinaryValue)
+            assert scalar.as_py() == raw
+            assert str(scalar) == str(raw)
+            assert repr(scalar) == repr(raw)
+            assert scalar == raw
+            assert scalar != b'xxxxx'
+            # ... and expose the same bytes through a Buffer
+            scalar_buffer = scalar.as_buffer()
+            assert isinstance(scalar_buffer, pa.Buffer)
+            assert scalar_buffer.to_pybytes() == raw
+
+        assert_scalar(arr[0], b'foo')
+        assert arr[1] is pa.NA
+        assert_scalar(arr[2], b'bar')
+
def test_fixed_size_bytes(self):
data = [b'foof', None, b'barb']
arr = pa.array(data, type=pa.binary(4))
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 7d9abf7..de532e6 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -49,6 +49,8 @@ def get_many_types():
pa.string(),
pa.binary(),
pa.binary(10),
+ pa.large_string(),
+ pa.large_binary(),
pa.list_(pa.int32()),
pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.int8()),
@@ -145,10 +147,24 @@ def test_is_union():
def test_is_binary_string():
assert types.is_binary(pa.binary())
assert not types.is_binary(pa.string())
+ assert not types.is_binary(pa.large_binary())
+ assert not types.is_binary(pa.large_string())
assert types.is_string(pa.string())
assert types.is_unicode(pa.string())
assert not types.is_string(pa.binary())
+ assert not types.is_string(pa.large_string())
+ assert not types.is_string(pa.large_binary())
+
+ assert types.is_large_binary(pa.large_binary())
+ assert not types.is_large_binary(pa.large_string())
+ assert not types.is_large_binary(pa.binary())
+ assert not types.is_large_binary(pa.string())
+
+ assert types.is_large_string(pa.large_string())
+ assert not types.is_large_string(pa.large_binary())
+ assert not types.is_large_string(pa.string())
+ assert not types.is_large_string(pa.binary())
assert types.is_fixed_size_binary(pa.binary(5))
assert not types.is_fixed_size_binary(pa.binary())
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 657df04..0db15d5 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -106,6 +106,7 @@ cdef class DataType:
"instead.".format(self.__class__.__name__))
cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ assert type != nullptr
self.sp_type = type
self.type = type.get()
self.pep3118_format = _datatype_to_pep3118(self.type)
@@ -1531,6 +1532,33 @@ def binary(int length=-1):
return pyarrow_wrap_data_type(fixed_size_binary_type)
+def large_binary():
+    """
+    Create a variable-length binary type of the large kind
+
+    Not all Arrow implementations may support this data type. Prefer
+    binary() unless the data you need to represent is larger than 2GB.
+    """
+    large_binary_type = primitive_type(_Type_LARGE_BINARY)
+    return large_binary_type
+
+
+def large_string():
+    """
+    Create a UTF8 variable-length string type of the large kind
+
+    Not all Arrow implementations may support this data type. Prefer
+    string() unless the data you need to represent is larger than 2GB.
+    """
+    large_string_type = primitive_type(_Type_LARGE_STRING)
+    return large_string_type
+
+
+def large_utf8():
+    """
+    Create large UTF8 variable-length string type (alias for large_string())
+    """
+    aliased_type = large_string()
+    return aliased_type
+
+
cpdef ListType list_(value_type):
"""
Create ListType instance from child data type or field
@@ -1714,6 +1742,10 @@ cdef dict _type_aliases = {
'str': string,
'utf8': string,
'binary': binary,
+ 'large_string': large_string,
+ 'large_str': large_string,
+ 'large_utf8': large_string,
+ 'large_binary': large_binary,
'date32': date32,
'date64': date64,
'date32[day]': date32,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index def1dde..dc314e8 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -228,6 +228,14 @@ def is_binary(t):
return t.id == lib.Type_BINARY
+def is_large_binary(t):
+    """
+    Return True if the type id of the given value is that of the large
+    variable-length binary type
+    """
+    type_id = t.id
+    return type_id == lib.Type_LARGE_BINARY
+
+
def is_unicode(t):
"""
Alias for is_string
@@ -242,6 +250,20 @@ def is_string(t):
return t.id == lib.Type_STRING
+def is_large_unicode(t):
+    """
+    Alias for is_large_string: the check is delegated to it unchanged
+    """
+    is_large = is_large_string(t)
+    return is_large
+
+
+def is_large_string(t):
+    """
+    Return True if the type id of the given value is that of the large
+    string (utf8 unicode) type
+    """
+    type_id = t.id
+    return type_id == lib.Type_LARGE_STRING
+
+
def is_fixed_size_binary(t):
"""
Return True if value is an instance of a fixed size binary type