You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/08/01 10:50:34 UTC

[arrow] branch master updated: ARROW-6000: [Python] Add support for LargeString and LargeBinary types

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new eb73b96  ARROW-6000: [Python] Add support for LargeString and LargeBinary types
eb73b96 is described below

commit eb73b962e42b5ae6983bf026ebf825f1f707e245
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Aug 1 12:50:21 2019 +0200

    ARROW-6000: [Python] Add support for LargeString and LargeBinary types
    
    Also fix a bug in Take / Filter for large binary types.
    
    Closes #4927 from pitrou/ARROW-6000-py-large-binary and squashes the following commits:
    
    0bb1b7f89 <Antoine Pitrou> Fix Take on LargeBinary / LargeString data
    9672ca4c3 <Antoine Pitrou> ARROW-6000:  Add support for LargeString and LargeBinary types
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/array/builder_binary.h          | 16 +++++--
 cpp/src/arrow/array/builder_decimal.h         |  2 +
 cpp/src/arrow/array/builder_primitive.h       |  3 ++
 cpp/src/arrow/array/builder_time.h            |  1 +
 cpp/src/arrow/compute/kernels/filter-test.cc  | 22 ++++++---
 cpp/src/arrow/compute/kernels/take-internal.h | 20 ++++----
 cpp/src/arrow/compute/kernels/take-test.cc    | 31 ++++++++----
 cpp/src/arrow/python/helpers.cc               |  2 +
 cpp/src/arrow/python/helpers.h                |  6 +++
 cpp/src/arrow/python/python_to_arrow.cc       | 66 +++++++++++++++----------
 cpp/src/arrow/type.h                          |  2 +
 docs/source/python/api/arrays.rst             |  4 ++
 docs/source/python/api/datatypes.rst          |  6 +++
 python/pyarrow/__init__.py                    |  9 +++-
 python/pyarrow/array.pxi                      | 40 ++++++++++++++++
 python/pyarrow/includes/libarrow.pxd          | 20 +++++++-
 python/pyarrow/lib.pyx                        |  2 +
 python/pyarrow/scalar.pxi                     | 69 +++++++++++++++++++++++++++
 python/pyarrow/tests/strategies.py            |  8 +++-
 python/pyarrow/tests/test_convert_builtin.py  | 58 ++++++++++++++++++----
 python/pyarrow/tests/test_scalars.py          | 37 ++++++++++++++
 python/pyarrow/tests/test_types.py            | 16 +++++++
 python/pyarrow/types.pxi                      | 32 +++++++++++++
 python/pyarrow/types.py                       | 22 +++++++++
 24 files changed, 424 insertions(+), 70 deletions(-)

diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 7ae4d31..869e61f 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -294,6 +294,11 @@ class BaseBinaryBuilder : public ArrayBuilder {
     return util::string_view(reinterpret_cast<const char*>(value_data), value_length);
   }
 
+  // Cannot make this a static attribute because of linking issues
+  static constexpr int64_t memory_limit() {
+    return std::numeric_limits<offset_type>::max() - 1;
+  }
+
  protected:
   TypedBufferBuilder<offset_type> offsets_builder_;
   TypedBufferBuilder<uint8_t> value_data_builder_;
@@ -315,11 +320,6 @@ class BaseBinaryBuilder : public ArrayBuilder {
     const int64_t num_bytes = value_data_builder_.length();
     offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
   }
-
-  // Cannot make this a static attribute because of linking issues
-  static constexpr int64_t memory_limit() {
-    return std::numeric_limits<offset_type>::max() - 1;
-  }
 };
 
 /// \class BinaryBuilder
@@ -387,6 +387,8 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
 
 class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
  public:
+  using TypeClass = FixedSizeBinaryType;
+
   FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                          MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
 
@@ -471,6 +473,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
   /// This view becomes invalid on the next modifying operation.
   util::string_view GetView(int64_t i) const;
 
+  static constexpr int64_t memory_limit() {
+    return std::numeric_limits<int64_t>::max() - 1;
+  }
+
  protected:
   int32_t byte_width_;
   BufferBuilder byte_builder_;
diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h
index e64d165..1b8d3b4 100644
--- a/cpp/src/arrow/array/builder_decimal.h
+++ b/cpp/src/arrow/array/builder_decimal.h
@@ -28,6 +28,8 @@ class Decimal128;
 
 class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
  public:
+  using TypeClass = Decimal128Type;
+
   explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                              MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
 
diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h
index 8abbe02..fecb8ca 100644
--- a/cpp/src/arrow/array/builder_primitive.h
+++ b/cpp/src/arrow/array/builder_primitive.h
@@ -58,6 +58,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 template <typename T>
 class NumericBuilder : public ArrayBuilder {
  public:
+  using TypeClass = T;
   using value_type = typename T::c_type;
   using ArrayType = typename TypeTraits<T>::ArrayType;
   using ArrayBuilder::ArrayBuilder;
@@ -265,7 +266,9 @@ using DoubleBuilder = NumericBuilder<DoubleType>;
 
 class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
  public:
+  using TypeClass = BooleanType;
   using value_type = bool;
+
   explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT);
 
   explicit BooleanBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h
index 3ff783b..e264176 100644
--- a/cpp/src/arrow/array/builder_time.h
+++ b/cpp/src/arrow/array/builder_time.h
@@ -34,6 +34,7 @@ namespace arrow {
 
 class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder {
  public:
+  using TypeClass = DayTimeIntervalType;
   using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
 
   explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc
index 45fd9e5..fa40753 100644
--- a/cpp/src/arrow/compute/kernels/filter-test.cc
+++ b/cpp/src/arrow/compute/kernels/filter-test.cc
@@ -285,18 +285,26 @@ TYPED_TEST(TestFilterKernelWithNumeric, ScalarInRangeAndFilterRandomNumeric) {
   }
 }
 
-class TestFilterKernelWithString : public TestFilterKernel<StringType> {
+using StringTypes =
+    ::testing::Types<BinaryType, StringType, LargeBinaryType, LargeStringType>;
+
+template <typename TypeClass>
+class TestFilterKernelWithString : public TestFilterKernel<TypeClass> {
  protected:
+  std::shared_ptr<DataType> value_type() {
+    return TypeTraits<TypeClass>::type_singleton();
+  }
+
   void AssertFilter(const std::string& values, const std::string& filter,
                     const std::string& expected) {
-    TestFilterKernel<StringType>::AssertFilter(utf8(), values, filter, expected);
+    TestFilterKernel<TypeClass>::AssertFilter(value_type(), values, filter, expected);
   }
   void AssertFilterDictionary(const std::string& dictionary_values,
                               const std::string& dictionary_filter,
                               const std::string& filter,
                               const std::string& expected_filter) {
-    auto dict = ArrayFromJSON(utf8(), dictionary_values);
-    auto type = dictionary(int8(), utf8());
+    auto dict = ArrayFromJSON(value_type(), dictionary_values);
+    auto type = dictionary(int8(), value_type());
     std::shared_ptr<Array> values, actual, expected;
     ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_filter),
                                           dict, &values));
@@ -307,13 +315,15 @@ class TestFilterKernelWithString : public TestFilterKernel<StringType> {
   }
 };
 
-TEST_F(TestFilterKernelWithString, FilterString) {
+TYPED_TEST_CASE(TestFilterKernelWithString, StringTypes);
+
+TYPED_TEST(TestFilterKernelWithString, FilterString) {
   this->AssertFilter(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["b"])");
   this->AssertFilter(R"([null, "b", "c"])", "[0, 1, 0]", R"(["b"])");
   this->AssertFilter(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b"])");
 }
 
-TEST_F(TestFilterKernelWithString, FilterDictionary) {
+TYPED_TEST(TestFilterKernelWithString, FilterDictionary) {
   auto dict = R"(["a", "b", "c", "d", "e"])";
   this->AssertFilterDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[4]");
   this->AssertFilterDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[4]");
diff --git a/cpp/src/arrow/compute/kernels/take-internal.h b/cpp/src/arrow/compute/kernels/take-internal.h
index 04e89d1..b5fe87f 100644
--- a/cpp/src/arrow/compute/kernels/take-internal.h
+++ b/cpp/src/arrow/compute/kernels/take-internal.h
@@ -20,11 +20,13 @@
 #include <algorithm>
 #include <limits>
 #include <memory>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "arrow/builder.h"
 #include "arrow/compute/context.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
@@ -37,21 +39,19 @@ namespace compute {
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
+// For non-binary builders, use regular value append
 template <typename Builder, typename Scalar>
-static Status UnsafeAppend(Builder* builder, Scalar&& value) {
+static typename std::enable_if<
+    !std::is_base_of<BaseBinaryType, typename Builder::TypeClass>::value, Status>::type
+UnsafeAppend(Builder* builder, Scalar&& value) {
   builder->UnsafeAppend(std::forward<Scalar>(value));
   return Status::OK();
 }
 
-// Use BinaryBuilder::UnsafeAppend, but reserve byte storage first
-static Status UnsafeAppend(BinaryBuilder* builder, util::string_view value) {
-  RETURN_NOT_OK(builder->ReserveData(static_cast<int64_t>(value.size())));
-  builder->UnsafeAppend(value);
-  return Status::OK();
-}
-
-// Use StringBuilder::UnsafeAppend, but reserve character storage first
-static Status UnsafeAppend(StringBuilder* builder, util::string_view value) {
+// For binary builders, need to reserve byte storage first
+template <typename Builder>
+static enable_if_base_binary<typename Builder::TypeClass, Status> UnsafeAppend(
+    Builder* builder, util::string_view value) {
   RETURN_NOT_OK(builder->ReserveData(static_cast<int64_t>(value.size())));
   builder->UnsafeAppend(value);
   return Status::OK();
diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc
index 6a8e30b..0f080fd 100644
--- a/cpp/src/arrow/compute/kernels/take-test.cc
+++ b/cpp/src/arrow/compute/kernels/take-test.cc
@@ -179,18 +179,26 @@ TYPED_TEST(TestTakeKernelWithNumeric, TakeRandomNumeric) {
   }
 }
 
-class TestTakeKernelWithString : public TestTakeKernel<StringType> {
- protected:
+using StringTypes =
+    ::testing::Types<BinaryType, StringType, LargeBinaryType, LargeStringType>;
+
+template <typename TypeClass>
+class TestTakeKernelWithString : public TestTakeKernel<TypeClass> {
+ public:
+  std::shared_ptr<DataType> value_type() {
+    return TypeTraits<TypeClass>::type_singleton();
+  }
+
   void AssertTake(const std::string& values, const std::string& indices,
                   const std::string& expected) {
-    TestTakeKernel<StringType>::AssertTake(utf8(), values, indices, expected);
+    TestTakeKernel<TypeClass>::AssertTake(value_type(), values, indices, expected);
   }
   void AssertTakeDictionary(const std::string& dictionary_values,
                             const std::string& dictionary_indices,
                             const std::string& indices,
                             const std::string& expected_indices) {
-    auto dict = ArrayFromJSON(utf8(), dictionary_values);
-    auto type = dictionary(int8(), utf8());
+    auto dict = ArrayFromJSON(value_type(), dictionary_values);
+    auto type = dictionary(int8(), value_type());
     std::shared_ptr<Array> values, actual, expected;
     ASSERT_OK(DictionaryArray::FromArrays(type, ArrayFromJSON(int8(), dictionary_indices),
                                           dict, &values));
@@ -201,19 +209,22 @@ class TestTakeKernelWithString : public TestTakeKernel<StringType> {
   }
 };
 
-TEST_F(TestTakeKernelWithString, TakeString) {
+TYPED_TEST_CASE(TestTakeKernelWithString, StringTypes);
+
+TYPED_TEST(TestTakeKernelWithString, TakeString) {
   this->AssertTake(R"(["a", "b", "c"])", "[0, 1, 0]", R"(["a", "b", "a"])");
   this->AssertTake(R"([null, "b", "c"])", "[0, 1, 0]", "[null, \"b\", null]");
   this->AssertTake(R"(["a", "b", "c"])", "[null, 1, 0]", R"([null, "b", "a"])");
 
+  std::shared_ptr<DataType> type = this->value_type();
   std::shared_ptr<Array> arr;
   ASSERT_RAISES(IndexError,
-                this->Take(utf8(), R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr));
-  ASSERT_RAISES(IndexError, this->Take(utf8(), R"(["a", "b", null, "ddd", "ee"])",
-                                       int64(), "[2, 5]", &arr));
+                this->Take(type, R"(["a", "b", "c"])", int8(), "[0, 9, 0]", &arr));
+  ASSERT_RAISES(IndexError, this->Take(type, R"(["a", "b", null, "ddd", "ee"])", int64(),
+                                       "[2, 5]", &arr));
 }
 
-TEST_F(TestTakeKernelWithString, TakeDictionary) {
+TYPED_TEST(TestTakeKernelWithString, TakeDictionary) {
   auto dict = R"(["a", "b", "c", "d", "e"])";
   this->AssertTakeDictionary(dict, "[3, 4, 2]", "[0, 1, 0]", "[3, 4, 3]");
   this->AssertTakeDictionary(dict, "[null, 4, 2]", "[0, 1, 0]", "[null, 4, null]");
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index e44878f..7bbac81 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -62,6 +62,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
       GET_PRIMITIVE_TYPE(DOUBLE, float64);
       GET_PRIMITIVE_TYPE(BINARY, binary);
       GET_PRIMITIVE_TYPE(STRING, utf8);
+      GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
+      GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
     default:
       return nullptr;
   }
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 4917f99..8661ee5 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -119,6 +119,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out,
   return Status::OK();
 }
 
+inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
+  // size is assumed to be non-negative
+  *out = static_cast<int64_t>(size);
+  return Status::OK();
+}
+
 // \brief Print the Python object's __str__ form along with the passed error
 // message
 ARROW_PYTHON_EXPORT
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 28d8c13..424e309 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -429,18 +429,20 @@ class TimestampConverter : public TypedConverter<TimestampType, TimestampConvert
 
 namespace detail {
 
-template <typename BuilderType, typename AppendFunc>
-inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool* is_full,
-                             AppendFunc&& append_func) {
-  int32_t length = -1;
-  RETURN_NOT_OK(internal::CastSize(view.size, &length));
-  DCHECK_GE(length, 0);
+template <typename BuilderType>
+inline Status AppendPyString(BuilderType* builder, const PyBytesView& view,
+                             bool* is_full) {
+  if (view.size > BuilderType::memory_limit()) {
+    return Status::Invalid("string too large for datatype");
+  }
+  DCHECK_GE(view.size, 0);
   // Did we reach the builder size limit?
-  if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+  if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size >
+                          BuilderType::memory_limit())) {
     *is_full = true;
     return Status::OK();
   }
-  RETURN_NOT_OK(append_func(view.bytes, length));
+  RETURN_NOT_OK(builder->Append(::arrow::util::string_view(view.bytes, view.size)));
   *is_full = false;
   return Status::OK();
 }
@@ -448,10 +450,13 @@ inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, bool
 inline Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) {
   PyBytesView view;
   RETURN_NOT_OK(view.FromString(obj));
-  return AppendPyString(builder, view, is_full,
-                        [&builder](const char* bytes, int32_t length) {
-                          return builder->Append(bytes, length);
-                        });
+  return AppendPyString(builder, view, is_full);
+}
+
+inline Status BuilderAppend(LargeBinaryBuilder* builder, PyObject* obj, bool* is_full) {
+  PyBytesView view;
+  RETURN_NOT_OK(view.FromString(obj));
+  return AppendPyString(builder, view, is_full);
 }
 
 inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
@@ -466,9 +471,7 @@ inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
     return internal::InvalidValue(obj, ss.str());
   }
 
-  return AppendPyString(
-      builder, view, is_full,
-      [&builder](const char* bytes, int32_t length) { return builder->Append(bytes); });
+  return AppendPyString(builder, view, is_full);
 }
 
 }  // namespace detail
@@ -496,12 +499,15 @@ class BinaryLikeConverter : public TypedConverter<Type, BinaryLikeConverter<Type
 
 class BytesConverter : public BinaryLikeConverter<BinaryType> {};
 
+class LargeBytesConverter : public BinaryLikeConverter<LargeBinaryType> {};
+
 class FixedWidthBytesConverter : public BinaryLikeConverter<FixedSizeBinaryType> {};
 
 // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8,
 // otherwise we allow but return results as BinaryArray
-template <bool STRICT>
-class StringConverter : public TypedConverter<StringType, StringConverter<STRICT>> {
+template <typename TypeClass, bool STRICT>
+class StringConverter
+    : public TypedConverter<TypeClass, StringConverter<TypeClass, STRICT>> {
  public:
   StringConverter() : binary_count_(0) {}
 
@@ -526,10 +532,7 @@ class StringConverter : public TypedConverter<StringType, StringConverter<STRICT
       }
     }
 
-    return detail::AppendPyString(this->typed_builder_, string_view_, is_full,
-                                  [this](const char* bytes, int32_t length) {
-                                    return this->typed_builder_->Append(bytes, length);
-                                  });
+    return detail::AppendPyString(this->typed_builder_, string_view_, is_full);
   }
 
   Status AppendItem(PyObject* obj) {
@@ -556,10 +559,13 @@ class StringConverter : public TypedConverter<StringType, StringConverter<STRICT
       // We should have bailed out earlier
       DCHECK(!STRICT);
 
+      using EquivalentBinaryType = typename TypeClass::EquivalentBinaryType;
+      using EquivalentBinaryArray = typename TypeTraits<EquivalentBinaryType>::ArrayType;
+
       for (size_t i = 0; i < out->size(); ++i) {
         auto binary_data = (*out)[i]->data()->Copy();
-        binary_data->type = ::arrow::binary();
-        (*out)[i] = std::make_shared<BinaryArray>(binary_data);
+        binary_data->type = TypeTraits<EquivalentBinaryType>::type_singleton();
+        (*out)[i] = std::make_shared<EquivalentBinaryArray>(binary_data);
       }
     }
     return Status::OK();
@@ -871,14 +877,24 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
     NUMERIC_CONVERTER(DOUBLE, DoubleType);
     SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter);
     SIMPLE_CONVERTER_CASE(BINARY, BytesConverter);
+    SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter);
     SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter);
     SIMPLE_CONVERTER_CASE(DATE32, Date32Converter);
     SIMPLE_CONVERTER_CASE(DATE64, Date64Converter);
     case Type::STRING:
       if (strict_conversions) {
-        *out = std::unique_ptr<SeqConverter>(new StringConverter<true>());
+        *out = std::unique_ptr<SeqConverter>(new StringConverter<StringType, true>());
+      } else {
+        *out = std::unique_ptr<SeqConverter>(new StringConverter<StringType, false>());
+      }
+      break;
+    case Type::LARGE_STRING:
+      if (strict_conversions) {
+        *out =
+            std::unique_ptr<SeqConverter>(new StringConverter<LargeStringType, true>());
       } else {
-        *out = std::unique_ptr<SeqConverter>(new StringConverter<false>());
+        *out =
+            std::unique_ptr<SeqConverter>(new StringConverter<LargeStringType, false>());
       }
       break;
     case Type::TIME32: {
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 753c73e..86df73b 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -670,6 +670,7 @@ class ARROW_EXPORT StringType : public BinaryType {
  public:
   static constexpr Type::type type_id = Type::STRING;
   static constexpr bool is_utf8 = true;
+  using EquivalentBinaryType = BinaryType;
 
   static constexpr const char* type_name() { return "utf8"; }
 
@@ -684,6 +685,7 @@ class ARROW_EXPORT LargeStringType : public LargeBinaryType {
  public:
   static constexpr Type::type type_id = Type::LARGE_STRING;
   static constexpr bool is_utf8 = true;
+  using EquivalentBinaryType = LargeBinaryType;
 
   static constexpr const char* type_name() { return "large_utf8"; }
 
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index db45eef..e10b5af 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -57,6 +57,8 @@ may expose data type-specific methods or properties.
    BinaryArray
    StringArray
    FixedSizeBinaryArray
+   LargeBinaryArray
+   LargeStringArray
    Time32Array
    Time64Array
    Date32Array
@@ -97,6 +99,8 @@ any of those classes directly.
    BinaryValue
    StringValue
    FixedSizeBinaryValue
+   LargeBinaryValue
+   LargeStringValue
    Time32Value
    Time64Value
    Date32Value
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 5ad0204..327bcf6 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -50,6 +50,9 @@ These should be used to create Arrow data types and schemas.
    binary
    string
    utf8
+   large_binary
+   large_string
+   large_utf8
    decimal128
    list_
    struct
@@ -129,6 +132,9 @@ represents a given data type (such as ``int32``) or general category
    is_binary
    is_unicode
    is_string
+   is_large_binary
+   is_large_unicode
+   is_large_string
    is_fixed_size_binary
    is_map
    is_dictionary
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index afe0636..51afa0f 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -52,7 +52,9 @@ from pyarrow.lib import (null, bool_,
                          uint8, uint16, uint32, uint64,
                          time32, time64, timestamp, date32, date64,
                          float16, float32, float64,
-                         binary, string, utf8, decimal128,
+                         binary, string, utf8,
+                         large_binary, large_string, large_utf8,
+                         decimal128,
                          list_, struct, union, dictionary, field,
                          type_for_alias,
                          DataType, DictionaryType, ListType, StructType,
@@ -77,6 +79,7 @@ from pyarrow.lib import (null, bool_,
                          Int64Array, UInt64Array,
                          ListArray, UnionArray,
                          BinaryArray, StringArray,
+                         LargeBinaryArray, LargeStringArray,
                          FixedSizeBinaryArray,
                          DictionaryArray,
                          Date32Array, Date64Array,
@@ -87,7 +90,9 @@ from pyarrow.lib import (null, bool_,
                          Int8Value, Int16Value, Int32Value, Int64Value,
                          UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                          HalfFloatValue, FloatValue, DoubleValue, ListValue,
-                         BinaryValue, StringValue, FixedSizeBinaryValue,
+                         BinaryValue, StringValue,
+                         LargeBinaryValue, LargeStringValue,
+                         FixedSizeBinaryValue,
                          DecimalValue, UnionValue, StructValue, DictionaryValue,
                          Date32Value, Date64Value,
                          Time32Value, Time64Value,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 15905a1..4341c41 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1178,12 +1178,50 @@ cdef class StringArray(Array):
                                   null_count, offset)
 
 
+cdef class LargeStringArray(Array):
+    """
+    Concrete class for Arrow arrays of large string (or utf8) data type.
+    """
+
+    @staticmethod
+    def from_buffers(int length, Buffer value_offsets, Buffer data,
+                     Buffer null_bitmap=None, int null_count=-1,
+                     int offset=0):
+        """
+        Construct a LargeStringArray from value_offsets and data buffers.
+        If there are nulls in the data, also a null_bitmap and the matching
+        null_count must be passed.
+
+        Parameters
+        ----------
+        length : int
+        value_offsets : Buffer
+        data : Buffer
+        null_bitmap : Buffer, optional
+        null_count : int, default -1
+        offset : int, default 0
+
+        Returns
+        -------
+        string_array : LargeStringArray
+        """
+        return Array.from_buffers(large_utf8(), length,
+                                  [null_bitmap, value_offsets, data],
+                                  null_count, offset)
+
+
 cdef class BinaryArray(Array):
     """
     Concrete class for Arrow arrays of variable-sized binary data type.
     """
 
 
+cdef class LargeBinaryArray(Array):
+    """
+    Concrete class for Arrow arrays of large variable-sized binary data type.
+    """
+
+
 cdef class DictionaryArray(Array):
     """
     Concrete class for dictionary-encoded Arrow arrays.
@@ -1449,6 +1487,8 @@ cdef dict _array_classes = {
     _Type_UNION: UnionArray,
     _Type_BINARY: BinaryArray,
     _Type_STRING: StringArray,
+    _Type_LARGE_BINARY: LargeBinaryArray,
+    _Type_LARGE_STRING: LargeStringArray,
     _Type_DICTIONARY: DictionaryArray,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray,
     _Type_DECIMAL: Decimal128Array,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 0f80ad7..bfda15d 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -65,6 +65,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_TIME64" arrow::Type::TIME64"
         _Type_BINARY" arrow::Type::BINARY"
         _Type_STRING" arrow::Type::STRING"
+        _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY"
+        _Type_LARGE_STRING" arrow::Type::LARGE_STRING"
         _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
 
         _Type_LIST" arrow::Type::LIST"
@@ -437,12 +439,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         const CArray* UnsafeChild(int pos)
         UnionMode mode()
 
-    cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
+    cdef cppclass CBinaryArray" arrow::BinaryArray"(CArray):
         const uint8_t* GetValue(int i, int32_t* length)
         shared_ptr[CBuffer] value_data()
         int32_t value_offset(int64_t i)
         int32_t value_length(int64_t i)
 
+    cdef cppclass CLargeBinaryArray" arrow::LargeBinaryArray"(CArray):
+        const uint8_t* GetValue(int i, int64_t* length)
+        shared_ptr[CBuffer] value_data()
+        int64_t value_offset(int64_t i)
+        int64_t value_length(int64_t i)
+
     cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray):
         CStringArray(int64_t length, shared_ptr[CBuffer] value_offsets,
                      shared_ptr[CBuffer] data,
@@ -452,6 +460,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
         c_string GetString(int i)
 
+    cdef cppclass CLargeStringArray" arrow::LargeStringArray" \
+            (CLargeBinaryArray):
+        CLargeStringArray(int64_t length, shared_ptr[CBuffer] value_offsets,
+                          shared_ptr[CBuffer] data,
+                          shared_ptr[CBuffer] null_bitmap,
+                          int64_t null_count,
+                          int64_t offset)
+
+        c_string GetString(int i)
+
     cdef cppclass CStructArray" arrow::StructArray"(CArray):
         CStructArray(shared_ptr[CDataType] type, int64_t length,
                      vector[shared_ptr[CArray]] children,
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 2da5a83..0b33f39 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -87,6 +87,8 @@ Type_TIME32 = _Type_TIME32
 Type_TIME64 = _Type_TIME64
 Type_BINARY = _Type_BINARY
 Type_STRING = _Type_STRING
+Type_LARGE_BINARY = _Type_LARGE_BINARY
+Type_LARGE_STRING = _Type_LARGE_STRING
 Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
 Type_LIST = _Type_LIST
 Type_STRUCT = _Type_STRUCT
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 326aefa..0ead3e5 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -437,6 +437,43 @@ cdef class StringValue(ArrayValue):
         cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
         return ap.GetString(self.index).decode('utf-8')
 
+    def as_buffer(self):
+        """
+        Return a view over this value as a Buffer object.
+        """
+        cdef:
+            CStringArray* ap = <CStringArray*> self.sp_array.get()
+            shared_ptr[CBuffer] buf
+
+        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
+                          ap.value_length(self.index))
+        return pyarrow_wrap_buffer(buf)
+
+
+cdef class LargeStringValue(ArrayValue):
+    """
+    Concrete class for large string (utf8) array elements.
+    """
+
+    def as_py(self):
+        """
+        Return this value as a Python unicode string.
+        """
+        cdef CLargeStringArray* ap = <CLargeStringArray*> self.sp_array.get()
+        return ap.GetString(self.index).decode('utf-8')
+
+    def as_buffer(self):
+        """
+        Return a view over this value as a Buffer object.
+        """
+        cdef:
+            CLargeStringArray* ap = <CLargeStringArray*> self.sp_array.get()
+            shared_ptr[CBuffer] buf
+
+        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
+                          ap.value_length(self.index))
+        return pyarrow_wrap_buffer(buf)
+
 
 cdef class BinaryValue(ArrayValue):
     """
@@ -468,6 +505,36 @@ cdef class BinaryValue(ArrayValue):
         return pyarrow_wrap_buffer(buf)
 
 
+cdef class LargeBinaryValue(ArrayValue):
+    """
+    Concrete class for large variable-sized binary array elements.
+    """
+
+    def as_py(self):
+        """
+        Return this value as a Python bytes object.
+        """
+        cdef:
+            const uint8_t* ptr
+            int64_t length
+            CLargeBinaryArray* ap = <CLargeBinaryArray*> self.sp_array.get()
+
+        ptr = ap.GetValue(self.index, &length)
+        return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length)
+
+    def as_buffer(self):
+        """
+        Return a view over this value as a Buffer object.
+        """
+        cdef:
+            CLargeBinaryArray* ap = <CLargeBinaryArray*> self.sp_array.get()
+            shared_ptr[CBuffer] buf
+
+        buf = SliceBuffer(ap.value_data(), ap.value_offset(self.index),
+                          ap.value_length(self.index))
+        return pyarrow_wrap_buffer(buf)
+
+
 cdef class ListValue(ArrayValue):
     """
     Concrete class for list array elements.
@@ -665,6 +732,8 @@ cdef dict _array_value_classes = {
     _Type_UNION: UnionValue,
     _Type_BINARY: BinaryValue,
     _Type_STRING: StringValue,
+    _Type_LARGE_BINARY: LargeBinaryValue,
+    _Type_LARGE_STRING: LargeStringValue,
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
     _Type_DECIMAL: DecimalValue,
     _Type_STRUCT: StructValue,
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index a282864..7312ce3 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -38,6 +38,8 @@ bool_type = st.just(pa.bool_())
 
 binary_type = st.just(pa.binary())
 string_type = st.just(pa.string())
+large_binary_type = st.just(pa.large_binary())
+large_string_type = st.just(pa.large_string())
 
 signed_integer_types = st.sampled_from([
     pa.int8(),
@@ -87,6 +89,8 @@ primitive_types = st.one_of(
     bool_type,
     binary_type,
     string_type,
+    large_binary_type,
+    large_string_type,
     numeric_types,
     temporal_types
 )
@@ -190,9 +194,9 @@ def arrays(draw, type, size=None):
     elif pa.types.is_timestamp(type):
         tz = pytz.timezone(type.tz) if type.tz is not None else None
         value = st.datetimes(timezones=st.just(tz))
-    elif pa.types.is_binary(type):
+    elif pa.types.is_binary(type) or pa.types.is_large_binary(type):
         value = st.binary()
-    elif pa.types.is_string(type):
+    elif pa.types.is_string(type) or pa.types.is_large_string(type):
         value = st.text()
     elif pa.types.is_decimal(type):
         # TODO(kszucs): properly limit the precision
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 81d5952..f39706e 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -26,6 +26,7 @@ import collections
 import datetime
 import decimal
 import itertools
+import math
 import traceback
 import sys
 
@@ -605,7 +606,7 @@ def test_sequence_unicode():
     assert arr.to_pylist() == data
 
 
-def test_array_mixed_unicode_bytes():
+def check_array_mixed_unicode_bytes(binary_type, string_type):
     values = [u'qux', b'foo', bytearray(b'barz')]
     b_values = [b'qux', b'foo', b'barz']
     u_values = [u'qux', u'foo', u'barz']
@@ -615,36 +616,75 @@ def test_array_mixed_unicode_bytes():
     assert arr.type == pa.binary()
     assert arr.equals(expected)
 
-    arr = pa.array(values, type=pa.string())
-    expected = pa.array(u_values, type=pa.string())
-    assert arr.type == pa.string()
+    arr = pa.array(values, type=binary_type)
+    expected = pa.array(b_values, type=binary_type)
+    assert arr.type == binary_type
+    assert arr.equals(expected)
+
+    arr = pa.array(values, type=string_type)
+    expected = pa.array(u_values, type=string_type)
+    assert arr.type == string_type
     assert arr.equals(expected)
 
 
+def test_array_mixed_unicode_bytes():
+    check_array_mixed_unicode_bytes(pa.binary(), pa.string())
+    check_array_mixed_unicode_bytes(pa.large_binary(), pa.large_string())
+
+
+@pytest.mark.large_memory
+@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
+def test_large_binary_array(ty):
+    # Construct a large binary array with more than 4GB of data
+    s = b"0123456789abcdefghijklmnopqrstuvwxyz" * 10
+    nrepeats = math.ceil((2**32 + 5) / len(s))
+    data = [s] * nrepeats
+    arr = pa.array(data, type=ty)
+    assert isinstance(arr, pa.Array)
+    assert arr.type == ty
+    assert len(arr) == nrepeats
+
+
+@pytest.mark.large_memory
+@pytest.mark.parametrize("ty", [pa.large_binary(), pa.large_string()])
+def test_large_binary_value(ty):
+    # Construct a large binary array with a single value larger than 4GB
+    s = b"0123456789abcdefghijklmnopqrstuvwxyz"
+    nrepeats = math.ceil((2**32 + 5) / len(s))
+    arr = pa.array([b"foo", s * nrepeats, None, b"bar"], type=ty)
+    assert isinstance(arr, pa.Array)
+    assert arr.type == ty
+    assert len(arr) == 4
+    buf = arr[1].as_buffer()
+    assert len(buf) == len(s) * nrepeats
+
+
 def test_sequence_bytes():
     u1 = b'ma\xc3\xb1ana'
     data = [b'foo',
             u1.decode('utf-8'),  # unicode gets encoded,
             bytearray(b'bar'),
             None]
-    for ty in [None, pa.binary()]:
+    for ty in [None, pa.binary(), pa.large_binary()]:
         arr = pa.array(data, type=ty)
         assert len(arr) == 4
         assert arr.null_count == 1
-        assert arr.type == pa.binary()
+        assert arr.type == (ty or pa.binary())  # None -> inferred binary
         assert arr.to_pylist() == [b'foo', u1, b'bar', None]
 
 
-def test_sequence_utf8_to_unicode():
+@pytest.mark.parametrize("ty", [pa.string(), pa.large_string()])
+def test_sequence_utf8_to_unicode(ty):
     # ARROW-1225
     data = [b'foo', None, b'bar']
-    arr = pa.array(data, type=pa.string())
+    arr = pa.array(data, type=ty)
+    assert arr.type == ty
     assert arr[0].as_py() == u'foo'
 
     # test a non-utf8 unicode string
     val = (u'mañana').encode('utf-16-le')
     with pytest.raises(pa.ArrowInvalid):
-        pa.array([val], type=pa.string())
+        pa.array([val], type=ty)
 
 
 def test_sequence_fixed_size_bytes():
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 406f4ee..ca7a10e 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -104,6 +104,25 @@ class TestScalars(unittest.TestCase):
         assert v == u'mañana'
         assert isinstance(v, unicode_type)
 
+    def test_large_string_unicode(self):
+        arr = pa.array([u'foo', None, u'mañana'], type=pa.large_string())
+
+        v = arr[0]
+        assert isinstance(v, pa.LargeStringValue)
+        assert v.as_py() == u'foo'
+        assert repr(v) == repr(u"foo")
+        assert str(v) == str(u"foo")
+        assert v == u'foo'
+        # Assert that newly created values are equal to the previously created
+        # one.
+        assert v == arr[0]
+
+        assert arr[1] is pa.NA
+
+        v = arr[2].as_py()
+        assert v == u'mañana'
+        assert isinstance(v, unicode_type)
+
     def test_bytes(self):
         arr = pa.array([b'foo', None, u('bar')])
 
@@ -122,6 +141,24 @@ class TestScalars(unittest.TestCase):
         assert arr[1] is pa.NA
         check_value(arr[2], b'bar')
 
+    def test_large_bytes(self):
+        arr = pa.array([b'foo', None, u('bar')], type=pa.large_binary())
+
+        def check_value(v, expected):
+            assert isinstance(v, pa.LargeBinaryValue)
+            assert v.as_py() == expected
+            assert str(v) == str(expected)
+            assert repr(v) == repr(expected)
+            assert v == expected
+            assert v != b'xxxxx'
+            buf = v.as_buffer()
+            assert isinstance(buf, pa.Buffer)
+            assert buf.to_pybytes() == expected
+
+        check_value(arr[0], b'foo')
+        assert arr[1] is pa.NA
+        check_value(arr[2], b'bar')
+
     def test_fixed_size_bytes(self):
         data = [b'foof', None, b'barb']
         arr = pa.array(data, type=pa.binary(4))
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 7d9abf7..de532e6 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -49,6 +49,8 @@ def get_many_types():
         pa.string(),
         pa.binary(),
         pa.binary(10),
+        pa.large_string(),
+        pa.large_binary(),
         pa.list_(pa.int32()),
         pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.int8()),
@@ -145,10 +147,24 @@ def test_is_union():
 def test_is_binary_string():
     assert types.is_binary(pa.binary())
     assert not types.is_binary(pa.string())
+    assert not types.is_binary(pa.large_binary())
+    assert not types.is_binary(pa.large_string())
 
     assert types.is_string(pa.string())
     assert types.is_unicode(pa.string())
     assert not types.is_string(pa.binary())
+    assert not types.is_string(pa.large_string())
+    assert not types.is_string(pa.large_binary())
+
+    assert types.is_large_binary(pa.large_binary())
+    assert not types.is_large_binary(pa.large_string())
+    assert not types.is_large_binary(pa.binary())
+    assert not types.is_large_binary(pa.string())
+
+    assert types.is_large_string(pa.large_string())
+    assert not types.is_large_string(pa.large_binary())
+    assert not types.is_large_string(pa.string())
+    assert not types.is_large_string(pa.binary())
 
     assert types.is_fixed_size_binary(pa.binary(5))
     assert not types.is_fixed_size_binary(pa.binary())
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 657df04..0db15d5 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -106,6 +106,7 @@ cdef class DataType:
                         "instead.".format(self.__class__.__name__))
 
     cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        assert type != nullptr
         self.sp_type = type
         self.type = type.get()
         self.pep3118_format = _datatype_to_pep3118(self.type)
@@ -1531,6 +1532,33 @@ def binary(int length=-1):
     return pyarrow_wrap_data_type(fixed_size_binary_type)
 
 
+def large_binary():
+    """
+    Create large variable-length binary type
+
+    This data type may not be supported by all Arrow implementations.  Unless
+    you need to represent data larger than 2GB, you should prefer binary().
+    """
+    return primitive_type(_Type_LARGE_BINARY)
+
+
+def large_string():
+    """
+    Create large UTF8 variable-length string type
+
+    This data type may not be supported by all Arrow implementations.  Unless
+    you need to represent data larger than 2GB, you should prefer string().
+    """
+    return primitive_type(_Type_LARGE_STRING)
+
+
+def large_utf8():
+    """
+    Alias for large_string()
+    """
+    return large_string()
+
+
 cpdef ListType list_(value_type):
     """
     Create ListType instance from child data type or field
@@ -1714,6 +1742,10 @@ cdef dict _type_aliases = {
     'str': string,
     'utf8': string,
     'binary': binary,
+    'large_string': large_string,
+    'large_str': large_string,
+    'large_utf8': large_string,
+    'large_binary': large_binary,
     'date32': date32,
     'date64': date64,
     'date32[day]': date32,
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index def1dde..dc314e8 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -228,6 +228,14 @@ def is_binary(t):
     return t.id == lib.Type_BINARY
 
 
+def is_large_binary(t):
+    """
+    Return True if value is an instance of a large variable-length
+    binary type
+    """
+    return t.id == lib.Type_LARGE_BINARY
+
+
 def is_unicode(t):
     """
     Alias for is_string
@@ -242,6 +250,20 @@ def is_string(t):
     return t.id == lib.Type_STRING
 
 
+def is_large_unicode(t):
+    """
+    Alias for is_large_string
+    """
+    return is_large_string(t)
+
+
+def is_large_string(t):
+    """
+    Return True if value is an instance of large string (utf8 unicode) type
+    """
+    return t.id == lib.Type_LARGE_STRING
+
+
 def is_fixed_size_binary(t):
     """
     Return True if value is an instance of a fixed size binary type