You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/04/05 13:34:56 UTC

[arrow] branch master updated: ARROW-2380: [Python] Streamline conversions

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9515fe9  ARROW-2380: [Python] Streamline conversions
9515fe9 is described below

commit 9515fe92d31c67590500b22abd4249ba5f6575bc
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Apr 5 15:34:46 2018 +0200

    ARROW-2380: [Python] Streamline conversions
    
    * Correctly check for overflow when constructing string / binary objects
    * Accept bytearrays in the same places as bytes objects
    * Other refactorings
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #1835 from pitrou/ARROW-2380-py-streamline-conversions and squashes the following commits:
    
    3ebb34f <Antoine Pitrou> Fix performance regression
    b238d3e <Antoine Pitrou> Templatize integer conversions
    1b269c9 <Antoine Pitrou> Make PyBytesView initializers non-static
    08db003 <Antoine Pitrou> Create python/decimal.{h,cc}
    b5bbb3e <Antoine Pitrou> ARROW-2380:  Streamline conversions
---
 cpp/src/arrow/python/CMakeLists.txt             |   2 +
 cpp/src/arrow/python/arrow_to_pandas.cc         |   1 +
 cpp/src/arrow/python/builtin_convert.cc         | 254 ++--------------
 cpp/src/arrow/python/builtin_convert.h          |   3 -
 cpp/src/arrow/python/common.cc                  |  16 +-
 cpp/src/arrow/python/common.h                   |  96 ++++--
 cpp/src/arrow/python/{helpers.cc => decimal.cc} | 118 +-------
 cpp/src/arrow/python/{helpers.h => decimal.h}   |  58 +---
 cpp/src/arrow/python/helpers.cc                 | 371 +++++++++++++-----------
 cpp/src/arrow/python/helpers.h                  | 110 +++----
 cpp/src/arrow/python/numpy_to_arrow.cc          | 100 ++-----
 cpp/src/arrow/python/python-test.cc             |   1 +
 cpp/src/arrow/python/python_to_arrow.cc         |  56 ++--
 python/pyarrow/lib.pyx                          |   4 -
 python/pyarrow/tests/test_convert_builtin.py    |  38 ++-
 python/pyarrow/tests/test_convert_pandas.py     |   8 +-
 python/pyarrow/tests/test_feather.py            |   2 +-
 17 files changed, 444 insertions(+), 794 deletions(-)

diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index b985df9..f6c92a7 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -54,6 +54,7 @@ set(ARROW_PYTHON_SRCS
   builtin_convert.cc
   common.cc
   config.cc
+  decimal.cc
   helpers.cc
   init.cc
   io.cc
@@ -104,6 +105,7 @@ install(FILES
   builtin_convert.h
   common.h
   config.h
+  decimal.h
   helpers.h
   init.h
   io.h
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 92461fc..41a07d0 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -47,6 +47,7 @@
 #include "arrow/python/builtin_convert.h"
 #include "arrow/python/common.h"
 #include "arrow/python/config.h"
+#include "arrow/python/decimal.h"
 #include "arrow/python/helpers.h"
 #include "arrow/python/numpy-internal.h"
 #include "arrow/python/numpy_convert.h"
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 5e99992..459e299 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -33,6 +33,7 @@
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
 
+#include "arrow/python/decimal.h"
 #include "arrow/python/helpers.h"
 #include "arrow/python/numpy_convert.h"
 #include "arrow/python/util/datetime.h"
@@ -42,26 +43,9 @@ namespace py {
 
 Status InvalidConversion(PyObject* obj, const std::string& expected_types,
                          std::ostream* out) {
-  OwnedRef type(PyObject_Type(obj));
-  RETURN_IF_PYERROR();
-  DCHECK_NE(type.obj(), nullptr);
-
-  OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__"));
-  RETURN_IF_PYERROR();
-  DCHECK_NE(type_name.obj(), nullptr);
-
-  PyObjectStringify bytestring(type_name.obj());
-  RETURN_IF_PYERROR();
-
-  const char* bytes = bytestring.bytes;
-  DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null";
-
-  Py_ssize_t size = bytestring.size;
-
-  std::string cpp_type_name(bytes, size);
-
-  (*out) << "Got Python object of type " << cpp_type_name
+  (*out) << "Got Python object of type " << Py_TYPE(obj)->tp_name
          << " but can only handle these types: " << expected_types;
+  // XXX streamline this?
   return Status::OK();
 }
 
@@ -100,7 +84,7 @@ class ScalarVisitor {
       ++date_count_;
     } else if (PyDateTime_CheckExact(obj)) {
       ++timestamp_count_;
-    } else if (PyBytes_Check(obj)) {
+    } else if (internal::IsPyBinary(obj)) {
       ++binary_count_;
     } else if (PyUnicode_Check(obj)) {
       ++unicode_count_;
@@ -445,122 +429,15 @@ class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter
   }
 };
 
-class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-
-    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int8_t>::max() ||
-                            val < std::numeric_limits<int8_t>::min())) {
-      return Status::Invalid(
-          "Cannot coerce values to array type that would "
-          "lose data");
-    }
-    RETURN_IF_PYERROR();
-    return typed_builder_->Append(static_cast<int8_t>(val));
-  }
-};
-
-class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-
-    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int16_t>::max() ||
-                            val < std::numeric_limits<int16_t>::min())) {
-      return Status::Invalid(
-          "Cannot coerce values to array type that would "
-          "lose data");
-    }
-    RETURN_IF_PYERROR();
-    return typed_builder_->Append(static_cast<int16_t>(val));
-  }
-};
-
-class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter> {
+template <typename IntType>
+class TypedIntConverter
+    : public TypedConverterVisitor<NumericBuilder<IntType>, TypedIntConverter<IntType>> {
  public:
   // Append a non-missing item
   Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-
-    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int32_t>::max() ||
-                            val < std::numeric_limits<int32_t>::min())) {
-      return Status::Invalid(
-          "Cannot coerce values to array type that would "
-          "lose data");
-    }
-    RETURN_IF_PYERROR();
-    return typed_builder_->Append(static_cast<int32_t>(val));
-  }
-};
-
-class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-    RETURN_IF_PYERROR();
-    return typed_builder_->Append(val);
-  }
-};
-
-class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<uint64_t>(PyLong_AsLongLong(obj));
-    RETURN_IF_PYERROR();
-
-    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint8_t>::max())) {
-      return Status::Invalid(
-          "Cannot coerce values to array type that would "
-          "lose data");
-    }
-    return typed_builder_->Append(static_cast<uint8_t>(val));
-  }
-};
-
-class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<uint64_t>(PyLong_AsLongLong(obj));
-    RETURN_IF_PYERROR();
-
-    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max())) {
-      return Status::Invalid(
-          "Cannot coerce values to array type that would "
-          "lose data");
-    }
-    return typed_builder_->Append(static_cast<uint16_t>(val));
-  }
-};
-
-class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    const auto val = static_cast<uint64_t>(PyLong_AsLongLong(obj));
-    RETURN_IF_PYERROR();
-
-    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint32_t>::max())) {
-      return Status::Invalid(
-          "Cannot coerce values to array type that would "
-          "lose data");
-    }
-    return typed_builder_->Append(static_cast<uint32_t>(val));
-  }
-};
-
-class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Converter> {
- public:
-  // Append a non-missing item
-  Status AppendItem(PyObject* obj) {
-    uint64_t val;
-    RETURN_NOT_OK(internal::UInt64FromPythonInt(obj, &val));
-    return typed_builder_->Append(val);
+    typename IntType::c_type value;
+    RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    return this->typed_builder_->Append(value);
   }
 };
 
@@ -573,12 +450,7 @@ class Date32Converter : public TypedConverterVisitor<Date32Builder, Date32Conver
       auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
       t = static_cast<int32_t>(PyDate_to_s(pydate));
     } else {
-      const auto casted_val = static_cast<int64_t>(PyLong_AsLongLong(obj));
-      RETURN_IF_PYERROR();
-      if (casted_val > std::numeric_limits<int32_t>::max()) {
-        return Status::Invalid("Integer as date32 larger than INT32_MAX");
-      }
-      t = static_cast<int32_t>(casted_val);
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32"));
     }
     return typed_builder_->Append(t);
   }
@@ -593,8 +465,7 @@ class Date64Converter : public TypedConverterVisitor<Date64Builder, Date64Conver
       auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
       t = PyDate_to_ms(pydate);
     } else {
-      t = static_cast<int64_t>(PyLong_AsLongLong(obj));
-      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date64"));
     }
     return typed_builder_->Append(t);
   }
@@ -645,8 +516,7 @@ class TimestampConverter
 
       t = reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
     } else {
-      t = static_cast<int64_t>(PyLong_AsLongLong(obj));
-      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &t));
     }
     return typed_builder_->Append(t);
   }
@@ -690,27 +560,7 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverte
  public:
   // Append a non-missing item
   Status AppendItem(PyObject* obj) {
-    PyObject* bytes_obj;
-    const char* bytes;
-    Py_ssize_t length;
-    OwnedRef tmp;
-
-    if (PyUnicode_Check(obj)) {
-      tmp.reset(PyUnicode_AsUTF8String(obj));
-      RETURN_IF_PYERROR();
-      bytes_obj = tmp.obj();
-    } else if (PyBytes_Check(obj)) {
-      bytes_obj = obj;
-    } else {
-      std::stringstream ss;
-      ss << "Error converting to Binary type: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "bytes", &ss));
-      return Status::Invalid(ss.str());
-    }
-    // No error checking
-    length = PyBytes_GET_SIZE(bytes_obj);
-    bytes = PyBytes_AS_STRING(bytes_obj);
-    return typed_builder_->Append(bytes, static_cast<int32_t>(length));
+    return internal::BuilderAppend(typed_builder_, obj);
   }
 };
 
@@ -719,27 +569,7 @@ class FixedWidthBytesConverter
  public:
   // Append a non-missing item
   Status AppendItem(PyObject* obj) {
-    PyObject* bytes_obj;
-    OwnedRef tmp;
-    Py_ssize_t expected_length =
-        std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
-            ->byte_width();
-    if (PyUnicode_Check(obj)) {
-      tmp.reset(PyUnicode_AsUTF8String(obj));
-      RETURN_IF_PYERROR();
-      bytes_obj = tmp.obj();
-    } else if (PyBytes_Check(obj)) {
-      bytes_obj = obj;
-    } else {
-      std::stringstream ss;
-      ss << "Error converting to FixedSizeBinary type: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "bytes", &ss));
-      return Status::Invalid(ss.str());
-    }
-    // No error checking
-    RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length));
-    return typed_builder_->Append(
-        reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(bytes_obj)));
+    return internal::BuilderAppend(typed_builder_, obj);
   }
 };
 
@@ -747,32 +577,7 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter>
  public:
   // Append a non-missing item
   Status AppendItem(PyObject* obj) {
-    PyObject* bytes_obj;
-    OwnedRef tmp;
-    const char* bytes;
-    Py_ssize_t length;
-
-    if (PyBytes_Check(obj)) {
-      tmp.reset(
-          PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj)));
-      RETURN_IF_PYERROR();
-      bytes_obj = obj;
-    } else if (!PyUnicode_Check(obj)) {
-      OwnedRef repr(PyObject_Repr(obj));
-      PyObjectStringify stringified(repr.obj());
-      std::stringstream ss;
-      ss << "Non bytes/unicode value encountered: " << stringified.bytes;
-      return Status::Invalid(ss.str());
-    } else {
-      tmp.reset(PyUnicode_AsUTF8String(obj));
-      RETURN_IF_PYERROR();
-      bytes_obj = tmp.obj();
-    }
-
-    // No error checking
-    length = PyBytes_GET_SIZE(bytes_obj);
-    bytes = PyBytes_AS_STRING(bytes_obj);
-    return typed_builder_->Append(bytes, static_cast<int32_t>(length));
+    return internal::BuilderAppend(typed_builder_, obj, true /* check_valid */);
   }
 };
 
@@ -876,21 +681,21 @@ std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
     case Type::BOOL:
       return std::unique_ptr<SeqConverter>(new BoolConverter);
     case Type::INT8:
-      return std::unique_ptr<SeqConverter>(new Int8Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int8Type>);
     case Type::INT16:
-      return std::unique_ptr<SeqConverter>(new Int16Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int16Type>);
     case Type::INT32:
-      return std::unique_ptr<SeqConverter>(new Int32Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int32Type>);
     case Type::INT64:
-      return std::unique_ptr<SeqConverter>(new Int64Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int64Type>);
     case Type::UINT8:
-      return std::unique_ptr<SeqConverter>(new UInt8Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt8Type>);
     case Type::UINT16:
-      return std::unique_ptr<SeqConverter>(new UInt16Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt16Type>);
     case Type::UINT32:
-      return std::unique_ptr<SeqConverter>(new UInt32Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt32Type>);
     case Type::UINT64:
-      return std::unique_ptr<SeqConverter>(new UInt64Converter);
+      return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt64Type>);
     case Type::DATE32:
       return std::unique_ptr<SeqConverter>(new Date32Converter);
     case Type::DATE64:
@@ -1032,16 +837,5 @@ Status ConvertPySequence(PyObject* obj, int64_t size,
   return ConvertPySequenceReal(obj, size, &type, pool, out);
 }
 
-Status CheckPythonBytesAreFixedLength(PyObject* obj, Py_ssize_t expected_length) {
-  const Py_ssize_t length = PyBytes_GET_SIZE(obj);
-  if (length != expected_length) {
-    std::stringstream ss;
-    ss << "Found byte string of length " << length << ", expected length is "
-       << expected_length;
-    return Status::Invalid(ss.str());
-  }
-  return Status::OK();
-}
-
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index 4bd3f08..7a32bec 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -73,9 +73,6 @@ ARROW_EXPORT
 Status InvalidConversion(PyObject* obj, const std::string& expected_type_name,
                          std::ostream* out);
 
-ARROW_EXPORT Status CheckPythonBytesAreFixedLength(PyObject* obj,
-                                                   Py_ssize_t expected_length);
-
 }  // namespace py
 }  // namespace arrow
 
diff --git a/cpp/src/arrow/python/common.cc b/cpp/src/arrow/python/common.cc
index 1ded880..bd13f29 100644
--- a/cpp/src/arrow/python/common.cc
+++ b/cpp/src/arrow/python/common.cc
@@ -25,6 +25,8 @@
 #include "arrow/status.h"
 #include "arrow/util/logging.h"
 
+#include "arrow/python/helpers.h"
+
 namespace arrow {
 namespace py {
 
@@ -87,19 +89,15 @@ Status CheckPyError(StatusCode code) {
     PyObject* exc_value = nullptr;
     PyObject* traceback = nullptr;
 
-    OwnedRef exc_type_ref(exc_type);
-    OwnedRef exc_value_ref(exc_value);
-    OwnedRef traceback_ref(traceback);
-
     PyErr_Fetch(&exc_type, &exc_value, &traceback);
-
     PyErr_NormalizeException(&exc_type, &exc_value, &traceback);
 
-    OwnedRef exc_value_str(PyObject_Str(exc_value));
-    PyObjectStringify stringified(exc_value_str.obj());
-    std::string message(stringified.bytes);
+    OwnedRef exc_type_ref(exc_type);
+    OwnedRef exc_value_ref(exc_value);
+    OwnedRef traceback_ref(traceback);
 
-    PyErr_Clear();
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(exc_value, &message));
     return Status(code, message);
   }
   return Status::OK();
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index b2844b1..76aee16 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -19,7 +19,9 @@
 #define ARROW_PYTHON_COMMON_H
 
 #include <memory>
+#include <sstream>
 #include <string>
+#include <utility>
 
 #include "arrow/python/config.h"
 
@@ -33,6 +35,15 @@ class MemoryPool;
 
 namespace py {
 
+ARROW_EXPORT Status CheckPyError(StatusCode code = StatusCode::UnknownError);
+
+ARROW_EXPORT Status PassPyError();
+
+// TODO(wesm): We can just let errors pass through. To be explored later
+#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError());
+
+#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE));
+
 class ARROW_EXPORT PyAcquireGIL {
  public:
   PyAcquireGIL() : acquired_gil_(false) { acquire(); }
@@ -70,6 +81,11 @@ class ARROW_EXPORT OwnedRef {
   OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
   explicit OwnedRef(PyObject* obj) : obj_(obj) {}
 
+  OwnedRef& operator=(OwnedRef&& other) {  // move-assign: adopt other's reference
+    reset(other.detach());  // via reset() so a previously held object is not leaked
+    return *this;
+  }
+
   ~OwnedRef() { reset(); }
 
   void reset(PyObject* obj) {
@@ -89,6 +105,8 @@ class ARROW_EXPORT OwnedRef {
 
   PyObject** ref() { return &obj_; }
 
+  operator bool() const { return obj_ != NULLPTR; }
+
  private:
   ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef);
 
@@ -110,36 +128,72 @@ class ARROW_EXPORT OwnedRefNoGIL : public OwnedRef {
   }
 };
 
-struct ARROW_EXPORT PyObjectStringify {
-  OwnedRef tmp_obj;
+// A temporary conversion of a Python object to a bytes area.
+struct ARROW_EXPORT PyBytesView {
   const char* bytes;
   Py_ssize_t size;
 
-  explicit PyObjectStringify(PyObject* obj) {
-    PyObject* bytes_obj;
+  PyBytesView() : bytes(nullptr), size(0), ref(nullptr) {}
+
+  // View the given Python object as binary-like, i.e. bytes
+  Status FromBinary(PyObject* obj) { return FromBinary(obj, "a bytes object"); }
+
+  // View the given Python object as string-like, i.e. str or (utf8) bytes
+  Status FromString(PyObject* obj, bool check_valid = false) {
     if (PyUnicode_Check(obj)) {
-      bytes_obj = PyUnicode_AsUTF8String(obj);
-      tmp_obj.reset(bytes_obj);
-      bytes = PyBytes_AsString(bytes_obj);
-      size = PyBytes_GET_SIZE(bytes_obj);
-    } else if (PyBytes_Check(obj)) {
-      bytes = PyBytes_AsString(obj);
-      size = PyBytes_GET_SIZE(obj);
+#if PY_MAJOR_VERSION >= 3
+      Py_ssize_t size;
+      // The utf-8 representation is cached on the unicode object
+      const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+      RETURN_IF_PYERROR();
+      this->bytes = data;
+      this->size = size;
+      this->ref.reset();
+      return Status::OK();
+#else
+      PyObject* converted = PyUnicode_AsUTF8String(obj);
+      RETURN_IF_PYERROR();
+      this->bytes = PyBytes_AS_STRING(converted);
+      this->size = PyBytes_GET_SIZE(converted);
+      this->ref.reset(converted);
+      return Status::OK();
+#endif
     } else {
-      bytes = NULLPTR;
-      size = -1;
+      RETURN_NOT_OK(FromBinary(obj, "a string or bytes object"));
+      if (check_valid) {
+        // Check the bytes are valid utf-8
+        OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size));
+        RETURN_IF_PYERROR();
+      }
+      return Status::OK();
     }
   }
-};
-
-Status CheckPyError(StatusCode code = StatusCode::UnknownError);
 
-Status PassPyError();
-
-// TODO(wesm): We can just let errors pass through. To be explored later
-#define RETURN_IF_PYERROR() RETURN_NOT_OK(CheckPyError());
+ protected:
+  PyBytesView(const char* b, Py_ssize_t s, PyObject* obj = nullptr)
+      : bytes(b), size(s), ref(obj) {}
+
+  Status FromBinary(PyObject* obj, const char* expected_msg) {
+    if (PyBytes_Check(obj)) {
+      this->bytes = PyBytes_AS_STRING(obj);
+      this->size = PyBytes_GET_SIZE(obj);
+      this->ref.reset();
+      return Status::OK();
+    } else if (PyByteArray_Check(obj)) {
+      this->bytes = PyByteArray_AS_STRING(obj);
+      this->size = PyByteArray_GET_SIZE(obj);
+      this->ref.reset();
+      return Status::OK();
+    } else {
+      std::stringstream ss;
+      ss << "Expected " << expected_msg << ", got a '" << Py_TYPE(obj)->tp_name
+         << "' object";
+      return Status::TypeError(ss.str());
+    }
+  }
 
-#define PY_RETURN_IF_ERROR(CODE) RETURN_NOT_OK(CheckPyError(CODE));
+  OwnedRef ref;
+};
 
 // Return the common PyArrow memory pool
 ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/decimal.cc
similarity index 66%
copy from cpp/src/arrow/python/helpers.cc
copy to cpp/src/arrow/python/decimal.cc
index 4fd9ef2..10593c7 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/decimal.cc
@@ -17,9 +17,9 @@
 
 #include <algorithm>
 #include <limits>
-#include <sstream>
 
 #include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
 #include "arrow/python/helpers.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
@@ -28,75 +28,8 @@
 
 namespace arrow {
 namespace py {
-
-#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
-  case Type::NAME:                        \
-    return FACTORY()
-
-std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
-  switch (type) {
-    case Type::NA:
-      return null();
-      GET_PRIMITIVE_TYPE(UINT8, uint8);
-      GET_PRIMITIVE_TYPE(INT8, int8);
-      GET_PRIMITIVE_TYPE(UINT16, uint16);
-      GET_PRIMITIVE_TYPE(INT16, int16);
-      GET_PRIMITIVE_TYPE(UINT32, uint32);
-      GET_PRIMITIVE_TYPE(INT32, int32);
-      GET_PRIMITIVE_TYPE(UINT64, uint64);
-      GET_PRIMITIVE_TYPE(INT64, int64);
-      GET_PRIMITIVE_TYPE(DATE32, date32);
-      GET_PRIMITIVE_TYPE(DATE64, date64);
-      GET_PRIMITIVE_TYPE(BOOL, boolean);
-      GET_PRIMITIVE_TYPE(HALF_FLOAT, float16);
-      GET_PRIMITIVE_TYPE(FLOAT, float32);
-      GET_PRIMITIVE_TYPE(DOUBLE, float64);
-      GET_PRIMITIVE_TYPE(BINARY, binary);
-      GET_PRIMITIVE_TYPE(STRING, utf8);
-    default:
-      return nullptr;
-  }
-}
-
-PyObject* PyHalf_FromHalf(npy_half value) {
-  PyObject* result = PyArrayScalar_New(Half);
-  if (result != NULL) {
-    PyArrayScalar_ASSIGN(result, Half, value);
-  }
-  return result;
-}
-
-Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
-  if (PyArray_IsScalar(obj, Half)) {
-    *out = PyArrayScalar_VAL(obj, Half);
-    return Status::OK();
-  } else {
-    // XXX: cannot use npy_double_to_half() without linking with Numpy
-    return Status::TypeError("Expected np.float16 instance");
-  }
-}
-
 namespace internal {
 
-Status ImportModule(const std::string& module_name, OwnedRef* ref) {
-  PyObject* module = PyImport_ImportModule(module_name.c_str());
-  RETURN_IF_PYERROR();
-  DCHECK_NE(module, nullptr) << "unable to import the " << module_name << " module";
-  ref->reset(module);
-  return Status::OK();
-}
-
-Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref) {
-  /// Assumes that ImportModule was called first
-  DCHECK_NE(module.obj(), nullptr) << "Cannot import from nullptr Python module";
-
-  PyObject* attr = PyObject_GetAttrString(module.obj(), name.c_str());
-  RETURN_IF_PYERROR();
-  DCHECK_NE(attr, nullptr) << "unable to import the " << name << " object";
-  ref->reset(attr);
-  return Status::OK();
-}
-
 Status ImportDecimalType(OwnedRef* decimal_type) {
   OwnedRef decimal_module;
   RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
@@ -106,20 +39,7 @@ Status ImportDecimalType(OwnedRef* decimal_type) {
 
 Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
   // Call Python's str(decimal_object)
-  OwnedRef str_obj(PyObject_Str(python_decimal));
-  RETURN_IF_PYERROR();
-
-  PyObjectStringify str(str_obj.obj());
-  RETURN_IF_PYERROR();
-
-  const char* bytes = str.bytes;
-  DCHECK_NE(bytes, nullptr);
-
-  Py_ssize_t size = str.size;
-
-  std::string c_string(bytes, size);
-  *out = c_string;
-  return Status::OK();
+  return PyObject_StdStringStr(python_decimal, out);
 }
 
 // \brief Infer the precision and scale of a Python decimal.Decimal instance
@@ -219,31 +139,6 @@ Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arr
   return Status::OK();
 }
 
-bool IsPyInteger(PyObject* obj) {
-#if PYARROW_IS_PY2
-  return PyLong_Check(obj) || PyInt_Check(obj);
-#else
-  return PyLong_Check(obj);
-#endif
-}
-
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
-  OwnedRef ref;
-  // PyLong_AsUnsignedLongLong() doesn't handle conversion from non-ints
-  // (e.g. np.uint64), so do it ourselves
-  if (!PyLong_Check(obj)) {
-    ref.reset(PyNumber_Long(obj));
-    RETURN_IF_PYERROR();
-    obj = ref.obj();
-  }
-  auto result = static_cast<uint64_t>(PyLong_AsUnsignedLongLong(obj));
-  if (result == static_cast<uint64_t>(-1)) {
-    RETURN_IF_PYERROR();
-  }
-  *out = static_cast<uint64_t>(result);
-  return Status::OK();
-}
-
 bool PyDecimal_Check(PyObject* obj) {
   static OwnedRef decimal_type;
   if (!decimal_type.obj()) {
@@ -301,15 +196,6 @@ Status DecimalMetadata::Update(PyObject* object) {
   return Update(precision, scale);
 }
 
-bool PyFloat_IsNaN(PyObject* obj) {
-  return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
-}
-
-bool PandasObjectIsNull(PyObject* obj) {
-  return obj == Py_None || obj == numpy_nan || PyFloat_IsNaN(obj) ||
-         (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj));
-}
-
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/decimal.h
similarity index 63%
copy from cpp/src/arrow/python/helpers.h
copy to cpp/src/arrow/python/decimal.h
index e2f3b18..41d821f 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/decimal.h
@@ -15,20 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef PYARROW_HELPERS_H
-#define PYARROW_HELPERS_H
+#ifndef ARROW_PYTHON_DECIMAL_H
+#define ARROW_PYTHON_DECIMAL_H
 
-#include "arrow/python/platform.h"
-
-#include <memory>
 #include <string>
-#include <utility>
-
-#include <numpy/halffloat.h>
 
 #include "arrow/type.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 
@@ -36,34 +28,13 @@ class Decimal128;
 
 namespace py {
 
-class OwnedRef;
-
-// \brief Get an arrow DataType instance from Arrow's Type::type enum
-// \param[in] type One of the values of Arrow's Type::type enum
-// \return A shared pointer to DataType
-ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
-
-// \brief Construct a np.float16 object from a npy_half value.
-ARROW_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
-
-// \brief Convert a Python object to a npy_half value.
-ARROW_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
+//
+// Python Decimal support
+//
 
 namespace internal {
 
-// \brief Import a Python module
-// \param[in] module_name The name of the module
-// \param[out] ref The OwnedRef containing the module PyObject*
-Status ImportModule(const std::string& module_name, OwnedRef* ref);
-
-// \brief Import an object from a Python module
-// \param[in] module A Python module
-// \param[in] name The name of the object to import
-// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
-// module
-Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
-
-// \brief Import
+// \brief Import the Python Decimal type
 Status ImportDecimalType(OwnedRef* decimal_type);
 
 // \brief Convert a Python Decimal object to a C++ string
@@ -87,15 +58,6 @@ PyObject* DecimalFromString(PyObject* decimal_constructor,
 Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                 Decimal128* out);
 
-// \brief Check whether obj is an integer, independent of Python versions.
-bool IsPyInteger(PyObject* obj);
-
-// \brief Use pandas missing value semantics to check if a value is null
-bool PandasObjectIsNull(PyObject* obj);
-
-// \brief Check whether obj is nan
-bool PyFloat_IsNaN(PyObject* obj);
-
 // \brief Check whether obj is an instance of Decimal
 bool PyDecimal_Check(PyObject* obj);
 
@@ -103,12 +65,6 @@ bool PyDecimal_Check(PyObject* obj);
 // is not a Decimal instance
 bool PyDecimal_ISNAN(PyObject* obj);
 
-// \brief Convert a Python integer into an unsigned 64-bit integer
-// \param[in] obj A Python integer
-// \param[out] out A pointer to a C uint64_t to hold the result of the conversion
-// \return The status of the operation
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out);
-
 // \brief Helper class to track and update the precision and scale of a decimal
 class DecimalMetadata {
  public:
@@ -137,4 +93,4 @@ class DecimalMetadata {
 }  // namespace py
 }  // namespace arrow
 
-#endif  // PYARROW_HELPERS_H
+#endif  // ARROW_PYTHON_DECIMAL_H
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 4fd9ef2..bb0837c 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -15,13 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <algorithm>
 #include <limits>
 #include <sstream>
+#include <type_traits>
+#include <typeinfo>
 
 #include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
 #include "arrow/python/helpers.h"
-#include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
 
 #include <arrow/api.h>
@@ -78,6 +79,60 @@ Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
 
 namespace internal {
 
+std::string PyBytes_AsStdString(PyObject* obj) {
+  DCHECK(PyBytes_Check(obj));
+  return std::string(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
+}
+
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
+  DCHECK(PyUnicode_Check(obj));
+#if PY_MAJOR_VERSION >= 3
+  Py_ssize_t size;
+  // The utf-8 representation is cached on the unicode object
+  const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+  RETURN_IF_PYERROR();
+  *out = std::string(data, size);
+  return Status::OK();
+#else
+  OwnedRef bytes_ref(PyUnicode_AsUTF8String(obj));
+  RETURN_IF_PYERROR();
+  *out = PyBytes_AsStdString(bytes_ref.obj());
+  return Status::OK();
+#endif
+}
+
+// Return repr(obj) as UTF-8 bytes, or a placeholder string if repr() fails.
+std::string PyObject_StdStringRepr(PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+  OwnedRef unicode_ref(PyObject_Repr(obj));
+  OwnedRef bytes_ref;
+  if (unicode_ref) {
+    bytes_ref.reset(
+        PyUnicode_AsEncodedString(unicode_ref.obj(), "utf8", "backslashreplace"));
+  }
+#else
+  OwnedRef bytes_ref(PyObject_Repr(obj));
+#endif
+  if (!bytes_ref) {  // repr() or encoding failed: checked for Python 2 and 3
+    PyErr_Clear();
+    std::stringstream ss;
+    ss << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
+    return ss.str();
+  }
+  return PyBytes_AsStdString(bytes_ref.obj());
+}
+
+Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
+  OwnedRef string_ref(PyObject_Str(obj));
+  RETURN_IF_PYERROR();
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_AsStdString(string_ref.obj(), out);
+#else
+  *out = PyBytes_AsStdString(string_ref.obj());
+  return Status::OK();
+#endif
+}
+
 Status ImportModule(const std::string& module_name, OwnedRef* ref) {
   PyObject* module = PyImport_ImportModule(module_name.c_str());
   RETURN_IF_PYERROR();
@@ -97,209 +152,177 @@ Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRe
   return Status::OK();
 }
 
-Status ImportDecimalType(OwnedRef* decimal_type) {
-  OwnedRef decimal_module;
-  RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
-  RETURN_NOT_OK(ImportFromModule(decimal_module, "Decimal", decimal_type));
+Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) {
+  PyBytesView view;
+  // XXX For some reason, we must accept unicode objects here
+  RETURN_NOT_OK(view.FromString(obj));
+  int32_t length;
+  RETURN_NOT_OK(CastSize(view.size, &length));
+  // Did we reach the builder size limit?
+  if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+    if (is_full) {
+      *is_full = true;
+      return Status::OK();
+    } else {
+      return Status::Invalid("Maximum array size reached (2GB)");
+    }
+  }
+  RETURN_NOT_OK(builder->Append(view.bytes, length));
+  if (is_full) {
+    *is_full = false;
+  }
   return Status::OK();
 }
 
-Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
-  // Call Python's str(decimal_object)
-  OwnedRef str_obj(PyObject_Str(python_decimal));
-  RETURN_IF_PYERROR();
-
-  PyObjectStringify str(str_obj.obj());
-  RETURN_IF_PYERROR();
-
-  const char* bytes = str.bytes;
-  DCHECK_NE(bytes, nullptr);
-
-  Py_ssize_t size = str.size;
-
-  std::string c_string(bytes, size);
-  *out = c_string;
+Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, bool* is_full) {
+  PyBytesView view;
+  // XXX For some reason, we must accept unicode objects here
+  RETURN_NOT_OK(view.FromString(obj));
+  const auto expected_length =
+      static_cast<const FixedSizeBinaryType&>(*builder->type()).byte_width();
+  if (ARROW_PREDICT_FALSE(view.size != expected_length)) {
+    std::stringstream ss;
+    ss << "Got bytestring of length " << view.size << " (expected " << expected_length
+       << ")";
+    return Status::Invalid(ss.str());
+  }
+  // Did we reach the builder size limit?
+  if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size >
+                          kBinaryMemoryLimit)) {
+    if (is_full) {
+      *is_full = true;
+      return Status::OK();
+    } else {
+      return Status::Invalid("Maximum array size reached (2GB)");
+    }
+  }
+  RETURN_NOT_OK(builder->Append(view.bytes));
+  if (is_full) {
+    *is_full = false;
+  }
   return Status::OK();
 }
 
-// \brief Infer the precision and scale of a Python decimal.Decimal instance
-// \param python_decimal[in] An instance of decimal.Decimal
-// \param precision[out] The value of the inferred precision
-// \param scale[out] The value of the inferred scale
-// \return The status of the operation
-static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
-                                            int32_t* scale) {
-  DCHECK_NE(python_decimal, NULLPTR);
-  DCHECK_NE(precision, NULLPTR);
-  DCHECK_NE(scale, NULLPTR);
-
-  // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a DCHECK
-  OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast<char*>("as_tuple"),
-                                        const_cast<char*>("")));
-  RETURN_IF_PYERROR();
-  DCHECK(PyTuple_Check(as_tuple.obj()));
-
-  OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits"));
-  RETURN_IF_PYERROR();
-  DCHECK(PyTuple_Check(digits.obj()));
-
-  const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj()));
-  RETURN_IF_PYERROR();
-
-  OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent"));
-  RETURN_IF_PYERROR();
-  DCHECK(IsPyInteger(py_exponent.obj()));
-
-  const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
-  RETURN_IF_PYERROR();
-
-  const int32_t abs_exponent = std::abs(exponent);
-
-  int32_t num_additional_zeros;
-
-  if (num_digits <= abs_exponent) {
-    DCHECK_NE(exponent, 0) << "exponent should never be zero here";
-
-    // we have leading/trailing zeros, leading if exponent is negative
-    num_additional_zeros = exponent < 0 ? abs_exponent - num_digits : exponent;
-    *scale = static_cast<int32_t>(exponent < 0) * -exponent;
-  } else {
-    // we can use the number of digits as the precision
-    num_additional_zeros = 0;
-    *scale = -exponent;
+Status BuilderAppend(StringBuilder* builder, PyObject* obj, bool check_valid,
+                     bool* is_full) {
+  PyBytesView view;
+  RETURN_NOT_OK(view.FromString(obj, check_valid));
+  int32_t length;
+  RETURN_NOT_OK(CastSize(view.size, &length));
+  // Did we reach the builder size limit?
+  if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+    if (is_full) {
+      *is_full = true;
+      return Status::OK();
+    } else {
+      return Status::Invalid("Maximum array size reached (2GB)");
+    }
+  }
+  RETURN_NOT_OK(builder->Append(view.bytes, length));
+  if (is_full) {
+    *is_full = false;
   }
-
-  *precision = num_digits + num_additional_zeros;
   return Status::OK();
 }
 
-PyObject* DecimalFromString(PyObject* decimal_constructor,
-                            const std::string& decimal_string) {
-  DCHECK_NE(decimal_constructor, nullptr);
-
-  auto string_size = decimal_string.size();
-  DCHECK_GT(string_size, 0);
-
-  auto string_bytes = decimal_string.c_str();
-  DCHECK_NE(string_bytes, nullptr);
-
-  return PyObject_CallFunction(decimal_constructor, const_cast<char*>("s#"), string_bytes,
-                               string_size);
-}
-
-Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
-                                Decimal128* out) {
-  DCHECK_NE(python_decimal, NULLPTR);
-  DCHECK_NE(out, NULLPTR);
-
-  std::string string;
-  RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string));
+namespace {
 
-  int32_t inferred_precision;
-  int32_t inferred_scale;
-
-  RETURN_NOT_OK(
-      Decimal128::FromString(string, out, &inferred_precision, &inferred_scale));
-
-  const int32_t precision = arrow_type.precision();
-  const int32_t scale = arrow_type.scale();
-
-  if (ARROW_PREDICT_FALSE(inferred_precision > precision)) {
-    std::stringstream buf;
-    buf << "Decimal type with precision " << inferred_precision
-        << " does not fit into precision inferred from first array element: "
-        << precision;
-    return Status::Invalid(buf.str());
+Status IntegerOverflowStatus(const std::string& overflow_message) {
+  if (overflow_message.empty()) {
+    return Status::Invalid("Value too large to fit in C integer type");
+  } else {
+    return Status::Invalid(overflow_message);
   }
+}
 
-  if (scale != inferred_scale) {
-    DCHECK_NE(out, NULLPTR);
-    RETURN_NOT_OK(out->Rescale(inferred_scale, scale, out));
+// Extract C signed int from Python object
+template <typename Int,
+          typename std::enable_if<std::is_signed<Int>::value, Int>::type = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(long long),  // NOLINT
+                "integer type larger than long long");
+
+  if (sizeof(Int) > sizeof(long)) {  // NOLINT
+    const auto value = PyLong_AsLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(overflow_message);
+    }
+    *out = static_cast<Int>(value);
   }
   return Status::OK();
 }
 
-bool IsPyInteger(PyObject* obj) {
-#if PYARROW_IS_PY2
-  return PyLong_Check(obj) || PyInt_Check(obj);
-#else
-  return PyLong_Check(obj);
-#endif
-}
+// Extract C unsigned int from Python object
+template <typename Int,
+          typename std::enable_if<std::is_unsigned<Int>::value, Int>::type = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(unsigned long long),  // NOLINT
+                "integer type larger than unsigned long long");
 
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out) {
   OwnedRef ref;
-  // PyLong_AsUnsignedLongLong() doesn't handle conversion from non-ints
-  // (e.g. np.uint64), so do it ourselves
+  // PyLong_AsUnsignedLong() and PyLong_AsUnsignedLongLong() don't handle
+  // conversion from non-ints (e.g. np.uint64), so do it ourselves
   if (!PyLong_Check(obj)) {
     ref.reset(PyNumber_Long(obj));
-    RETURN_IF_PYERROR();
+    if (!ref) {
+      RETURN_IF_PYERROR();
+    }
     obj = ref.obj();
   }
-  auto result = static_cast<uint64_t>(PyLong_AsUnsignedLongLong(obj));
-  if (result == static_cast<uint64_t>(-1)) {
-    RETURN_IF_PYERROR();
+  if (sizeof(Int) > sizeof(unsigned long)) {  // NOLINT
+    const auto value = PyLong_AsUnsignedLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsUnsignedLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(overflow_message);
+    }
+    *out = static_cast<Int>(value);
   }
-  *out = static_cast<uint64_t>(result);
   return Status::OK();
 }
 
-bool PyDecimal_Check(PyObject* obj) {
-  static OwnedRef decimal_type;
-  if (!decimal_type.obj()) {
-    Status status = ImportDecimalType(&decimal_type);
-    DCHECK_OK(status);
-    DCHECK(PyType_Check(decimal_type.obj()));
-  }
-  // PyObject_IsInstance() is slower as it has to check for virtual subclasses
-  const int result =
-      PyType_IsSubtype(Py_TYPE(obj), reinterpret_cast<PyTypeObject*>(decimal_type.obj()));
-  DCHECK_NE(result, -1) << " error during PyType_IsSubtype check";
-  return result == 1;
-}
-
-bool PyDecimal_ISNAN(PyObject* obj) {
-  DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
-  OwnedRef is_nan(
-      PyObject_CallMethod(obj, const_cast<char*>("is_nan"), const_cast<char*>("")));
-  return PyObject_IsTrue(is_nan.obj()) == 1;
-}
-
-DecimalMetadata::DecimalMetadata()
-    : DecimalMetadata(std::numeric_limits<int32_t>::min(),
-                      std::numeric_limits<int32_t>::min()) {}
+}  // namespace
 
-DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale)
-    : precision_(precision), scale_(scale) {}
-
-Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) {
-  const int32_t current_precision = precision_;
-  precision_ = std::max(current_precision, suggested_precision);
-
-  const int32_t current_scale = scale_;
-  scale_ = std::max(current_scale, suggested_scale);
-
-  // if our suggested scale is zero and we don't yet have enough precision then we need to
-  // add whatever the current scale is to the precision
-  if (suggested_scale == 0 && suggested_precision > current_precision) {
-    precision_ += scale_;
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {
+  if (PyBool_Check(obj)) {
+    return Status::TypeError("Expected integer, got bool");
   }
-
-  return Status::OK();
+  return CIntFromPythonImpl(obj, out, overflow_message);
 }
 
-Status DecimalMetadata::Update(PyObject* object) {
-  DCHECK(PyDecimal_Check(object)) << "Object is not a Python Decimal";
-
-  if (ARROW_PREDICT_FALSE(PyDecimal_ISNAN(object))) {
-    return Status::OK();
-  }
-
-  int32_t precision;
-  int32_t scale;
-  RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
-  return Update(precision, scale);
-}
+template Status CIntFromPython(PyObject*, int8_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int64_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint8_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint64_t*, const std::string&);
 
 bool PyFloat_IsNaN(PyObject* obj) {
   return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index e2f3b18..195d5fb 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -15,11 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef PYARROW_HELPERS_H
-#define PYARROW_HELPERS_H
+#ifndef ARROW_PYTHON_HELPERS_H
+#define ARROW_PYTHON_HELPERS_H
 
 #include "arrow/python/platform.h"
 
+#include <limits>
 #include <memory>
 #include <string>
 #include <utility>
@@ -32,8 +33,6 @@
 
 namespace arrow {
 
-class Decimal128;
-
 namespace py {
 
 class OwnedRef;
@@ -63,32 +62,14 @@ Status ImportModule(const std::string& module_name, OwnedRef* ref);
 // module
 Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
 
-// \brief Import
-Status ImportDecimalType(OwnedRef* decimal_type);
-
-// \brief Convert a Python Decimal object to a C++ string
-// \param[in] python_decimal A Python decimal.Decimal instance
-// \param[out] The string representation of the Python Decimal instance
-// \return The status of the operation
-Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
-
-// \brief Convert a C++ std::string to a Python Decimal instance
-// \param[in] decimal_constructor The decimal type object
-// \param[in] decimal_string A decimal string
-// \return An instance of decimal.Decimal
-PyObject* DecimalFromString(PyObject* decimal_constructor,
-                            const std::string& decimal_string);
-
-// \brief Convert a Python decimal to an Arrow Decimal128 object
-// \param[in] python_decimal A Python decimal.Decimal instance
-// \param[in] arrow_type An instance of arrow::DecimalType
-// \param[out] out A pointer to a Decimal128
-// \return The status of the operation
-Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
-                                Decimal128* out);
-
 // \brief Check whether obj is an integer, independent of Python versions.
-bool IsPyInteger(PyObject* obj);
+inline bool IsPyInteger(PyObject* obj) {
+#if PYARROW_IS_PY2
+  return PyLong_Check(obj) || PyInt_Check(obj);
+#else
+  return PyLong_Check(obj);
+#endif
+}
 
 // \brief Use pandas missing value semantics to check if a value is null
 bool PandasObjectIsNull(PyObject* obj);
@@ -96,45 +77,48 @@ bool PandasObjectIsNull(PyObject* obj);
 // \brief Check whether obj is nan
 bool PyFloat_IsNaN(PyObject* obj);
 
-// \brief Check whether obj is an instance of Decimal
-bool PyDecimal_Check(PyObject* obj);
-
-// \brief Check whether obj is nan. This function will abort the program if the argument
-// is not a Decimal instance
-bool PyDecimal_ISNAN(PyObject* obj);
+inline bool IsPyBinary(PyObject* obj) {
+  return PyBytes_Check(obj) || PyByteArray_Check(obj);
+}
 
-// \brief Convert a Python integer into an unsigned 64-bit integer
+// \brief Convert a Python integer into a C integer
 // \param[in] obj A Python integer
-// \param[out] out A pointer to a C uint64_t to hold the result of the conversion
+// \param[out] out A pointer to a C integer to hold the result of the conversion
 // \return The status of the operation
-Status UInt64FromPythonInt(PyObject* obj, uint64_t* out);
-
-// \brief Helper class to track and update the precision and scale of a decimal
-class DecimalMetadata {
- public:
-  DecimalMetadata();
-  DecimalMetadata(int32_t precision, int32_t scale);
-
-  // \brief Adjust the precision and scale of a decimal type given a new precision and a
-  // new scale \param[in] suggested_precision A candidate precision \param[in]
-  // suggested_scale A candidate scale \return The status of the operation
-  Status Update(int32_t suggested_precision, int32_t suggested_scale);
-
-  // \brief A convenient interface for updating the precision and scale based on a Python
-  // Decimal object \param object A Python Decimal object \return The status of the
-  // operation
-  Status Update(PyObject* object);
-
-  int32_t precision() const { return precision_; }
-  int32_t scale() const { return scale_; }
-
- private:
-  int32_t precision_;
-  int32_t scale_;
-};
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
+
+// \brief Convert a Python unicode string to a std::string
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
+
+// \brief Convert a Python bytes object to a std::string
+std::string PyBytes_AsStdString(PyObject* obj);
+
+// \brief Call str() on the given object and return the result as a std::string
+Status PyObject_StdStringStr(PyObject* obj, std::string* out);
+
+// \brief Return the repr() of the given object (always succeeds)
+std::string PyObject_StdStringRepr(PyObject* obj);
+
+// \brief Cast the given size to int32_t, with error checking
+inline Status CastSize(Py_ssize_t size, int32_t* out,
+                       const char* error_msg = "Maximum size exceeded (2GB)") {
+  // size is assumed to be positive
+  if (size > std::numeric_limits<int32_t>::max()) {
+    return Status::Invalid(error_msg);
+  }
+  *out = static_cast<int32_t>(size);
+  return Status::OK();
+}
+
+Status BuilderAppend(StringBuilder* builder, PyObject* obj, bool check_valid = false,
+                     bool* is_full = nullptr);
+Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full = nullptr);
+Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj,
+                     bool* is_full = nullptr);
 
 }  // namespace internal
 }  // namespace py
 }  // namespace arrow
 
-#endif  // PYARROW_HELPERS_H
+#endif  // ARROW_PYTHON_HELPERS_H
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index eb0af8b..e37013c 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -49,6 +49,7 @@
 #include "arrow/python/builtin_convert.h"
 #include "arrow/python/common.h"
 #include "arrow/python/config.h"
+#include "arrow/python/decimal.h"
 #include "arrow/python/helpers.h"
 #include "arrow/python/numpy-internal.h"
 #include "arrow/python/numpy_convert.h"
@@ -65,14 +66,6 @@ using internal::NumPyTypeSize;
 
 namespace {
 
-inline bool PyObject_is_string(PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
-  return PyUnicode_Check(obj) || PyBytes_Check(obj);
-#else
-  return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
 inline bool PyObject_is_integer(PyObject* obj) {
   return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj);
 }
@@ -204,8 +197,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
 static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask,
                                    int64_t offset, BinaryBuilder* builder,
                                    int64_t* end_offset) {
-  PyObject* obj;
-
   Ndarray1DIndexer<PyObject*> objects(arr);
   Ndarray1DIndexer<uint8_t> mask_values;
 
@@ -216,30 +207,15 @@ static Status AppendObjectBinaries(PyArrayObject* arr, PyArrayObject* mask,
   }
 
   for (; offset < objects.size(); ++offset) {
-    OwnedRef tmp_obj;
-    obj = objects[offset];
+    PyObject* obj = objects[offset];
     if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
       continue;
-    } else if (PyBytes_Check(obj)) {
-      const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
-      if (ARROW_PREDICT_FALSE(builder->value_data_length() + length >
-                              kBinaryMemoryLimit)) {
-        break;
-      }
-      RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
-    } else if (PyByteArray_Check(obj)) {
-      const int32_t length = static_cast<int32_t>(PyByteArray_GET_SIZE(obj));
-      if (ARROW_PREDICT_FALSE(builder->value_data_length() + length >
-                              kBinaryMemoryLimit)) {
-        break;
-      }
-      RETURN_NOT_OK(builder->Append(PyByteArray_AS_STRING(obj), length));
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to bytes: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes, bytearray", &ss));
-      return Status::Invalid(ss.str());
+    }
+    bool is_full;
+    RETURN_NOT_OK(internal::BuilderAppend(builder, obj, &is_full));
+    if (is_full) {
+      break;
     }
   }
 
@@ -275,27 +251,16 @@ static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64
     if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
       continue;
-    } else if (PyUnicode_Check(obj)) {
-      obj = PyUnicode_AsUTF8String(obj);
-      if (obj == NULL) {
-        PyErr_Clear();
-        return Status::Invalid("failed converting unicode to UTF8");
-      }
-      tmp_obj.reset(obj);
-    } else if (PyBytes_Check(obj)) {
+    }
+    if (internal::IsPyBinary(obj)) {
       *have_bytes = true;
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to String/UTF8: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
-      return Status::Invalid(ss.str());
     }
-
-    const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
-    if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+    bool is_full;
+    RETURN_NOT_OK(
+        internal::BuilderAppend(builder, obj, false /* check_valid */, &is_full));
+    if (is_full) {
       break;
     }
-    RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
   }
 
   // If we consumed the whole array, this will be the length of arr
@@ -324,28 +289,12 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas
     if ((have_mask && mask_values[offset]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder->AppendNull());
       continue;
-    } else if (PyUnicode_Check(obj)) {
-      obj = PyUnicode_AsUTF8String(obj);
-      if (obj == NULL) {
-        PyErr_Clear();
-        return Status::Invalid("failed converting unicode to UTF8");
-      }
-
-      tmp_obj.reset(obj);
-    } else if (!PyBytes_Check(obj)) {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to FixedSizeBinary: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
-      return Status::Invalid(ss.str());
     }
-
-    RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
-    if (ARROW_PREDICT_FALSE(builder->value_data_length() + byte_width >
-                            kBinaryMemoryLimit)) {
+    bool is_full;
+    RETURN_NOT_OK(internal::BuilderAppend(builder, obj, &is_full));
+    if (is_full) {
       break;
     }
-    RETURN_NOT_OK(
-        builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
   }
 
   // If we consumed the whole array, this will be the length of arr
@@ -981,15 +930,10 @@ Status NumPyConverter::ConvertObjectIntegers() {
     obj = objects[i];
     if ((have_mask && mask_values[i]) || internal::PandasObjectIsNull(obj)) {
       RETURN_NOT_OK(builder.AppendNull());
-    } else if (PyObject_is_integer(obj)) {
-      const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj));
-      RETURN_IF_PYERROR();
-      RETURN_NOT_OK(builder.Append(val));
     } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Int64: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "integer", &ss));
-      return Status::Invalid(ss.str());
+      int64_t val;
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &val));
+      RETURN_NOT_OK(builder.Append(val));
     }
   }
 
@@ -1102,7 +1046,9 @@ Status NumPyConverter::ConvertObjectsInfer() {
     PyObject* obj = objects[i];
     if (internal::PandasObjectIsNull(obj)) {
       continue;
-    } else if (PyObject_is_string(obj)) {
+    } else if (PyUnicode_Check(obj) || internal::IsPyBinary(obj)) {
+      // The exact Arrow type (Binary or String) will be decided based on
+      // Python object types
       return ConvertObjectStrings();
     } else if (PyFloat_Check(obj)) {
       return ConvertObjectFloats();
@@ -1119,8 +1065,6 @@ Status NumPyConverter::ConvertObjectsInfer() {
       return ConvertTimes();
     } else if (PyObject_IsInstance(obj, decimal_type_.obj()) == 1) {
       return ConvertDecimals();
-    } else if (PyByteArray_Check(obj)) {
-      return ConvertObjectBytes();
     } else if (PyList_Check(obj)) {
       std::shared_ptr<DataType> inferred_type;
       RETURN_NOT_OK(InferArrowType(obj, &inferred_type));
diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc
index c18b159..293255b 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -28,6 +28,7 @@
 
 #include "arrow/python/arrow_to_pandas.h"
 #include "arrow/python/builtin_convert.h"
+#include "arrow/python/decimal.h"
 #include "arrow/python/helpers.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index d781d9f..998fa8a 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -84,7 +84,9 @@ class SequenceBuilder {
     if (*tag == -1) {
       *tag = num_tags_++;
     }
-    RETURN_NOT_OK(offsets_.Append(static_cast<int32_t>(offset)));
+    int32_t offset32;
+    RETURN_NOT_OK(internal::CastSize(offset, &offset32));
+    RETURN_NOT_OK(offsets_.Append(offset32));
     RETURN_NOT_OK(types_.Append(*tag));
     return nones_.AppendToBitmap(true);
   }
@@ -173,26 +175,34 @@ class SequenceBuilder {
   /// \param size
   /// The size of the sublist
   Status AppendList(Py_ssize_t size) {
+    int32_t offset;
+    RETURN_NOT_OK(internal::CastSize(list_offsets_.back() + size, &offset));
     RETURN_NOT_OK(Update(list_offsets_.size() - 1, &list_tag_));
-    list_offsets_.push_back(list_offsets_.back() + static_cast<int32_t>(size));
+    list_offsets_.push_back(offset);
     return Status::OK();
   }
 
   Status AppendTuple(Py_ssize_t size) {
+    int32_t offset;
+    RETURN_NOT_OK(internal::CastSize(tuple_offsets_.back() + size, &offset));
     RETURN_NOT_OK(Update(tuple_offsets_.size() - 1, &tuple_tag_));
-    tuple_offsets_.push_back(tuple_offsets_.back() + static_cast<int32_t>(size));
+    tuple_offsets_.push_back(offset);
     return Status::OK();
   }
 
   Status AppendDict(Py_ssize_t size) {
+    int32_t offset;
+    RETURN_NOT_OK(internal::CastSize(dict_offsets_.back() + size, &offset));
     RETURN_NOT_OK(Update(dict_offsets_.size() - 1, &dict_tag_));
-    dict_offsets_.push_back(dict_offsets_.back() + static_cast<int32_t>(size));
+    dict_offsets_.push_back(offset);
     return Status::OK();
   }
 
   Status AppendSet(Py_ssize_t size) {
+    int32_t offset;
+    RETURN_NOT_OK(internal::CastSize(set_offsets_.back() + size, &offset));
     RETURN_NOT_OK(Update(set_offsets_.size() - 1, &set_tag_));
-    set_offsets_.push_back(set_offsets_.back() + static_cast<int32_t>(size));
+    set_offsets_.push_back(offset);
     return Status::OK();
   }
 
@@ -365,17 +375,8 @@ Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* el
   *result = NULL;
   if (context == Py_None) {
     std::stringstream ss;
-    OwnedRef repr(PyObject_Repr(elem));
-    RETURN_IF_PYERROR();
-#if PY_MAJOR_VERSION >= 3
-    OwnedRef ascii(PyUnicode_AsASCIIString(repr.obj()));
-    RETURN_IF_PYERROR();
-    ss << "error while calling callback on " << PyBytes_AsString(ascii.obj())
+    ss << "error while calling callback on " << internal::PyObject_StdStringRepr(elem)
        << ": handler not registered";
-#else
-    ss << "error while calling callback on " << PyString_AsString(repr.obj())
-       << ": handler not registered";
-#endif
     return Status::SerializationError(ss.str());
   } else {
     *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL);
@@ -483,24 +484,15 @@ Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
 #endif
   } else if (PyBytes_Check(elem)) {
     auto data = reinterpret_cast<uint8_t*>(PyBytes_AS_STRING(elem));
-    const int64_t size = static_cast<int64_t>(PyBytes_GET_SIZE(elem));
-    if (size > std::numeric_limits<int32_t>::max()) {
-      return Status::Invalid("Cannot writes bytes over 2GB");
-    }
-    RETURN_NOT_OK(builder->AppendBytes(data, static_cast<int32_t>(size)));
+    int32_t size;
+    RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size));
+    RETURN_NOT_OK(builder->AppendBytes(data, size));
   } else if (PyUnicode_Check(elem)) {
-    Py_ssize_t size;
-#if PY_MAJOR_VERSION >= 3
-    char* data = PyUnicode_AsUTF8AndSize(elem, &size);
-#else
-    OwnedRef str(PyUnicode_AsUTF8String(elem));
-    char* data = PyString_AS_STRING(str.obj());
-    size = PyString_GET_SIZE(str.obj());
-#endif
-    if (size > std::numeric_limits<int32_t>::max()) {
-      return Status::Invalid("Cannot writes bytes over 2GB");
-    }
-    RETURN_NOT_OK(builder->AppendString(data, static_cast<int32_t>(size)));
+    PyBytesView view;
+    RETURN_NOT_OK(view.FromString(elem));
+    int32_t size;
+    RETURN_NOT_OK(internal::CastSize(view.size, &size));
+    RETURN_NOT_OK(builder->AppendString(view.bytes, size));
   } else if (PyList_CheckExact(elem)) {
     RETURN_NOT_OK(builder->AppendList(PyList_Size(elem)));
     sublists->push_back(elem);
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 672be08..8929ea0 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -33,10 +33,6 @@ from pyarrow.includes.common cimport PyObject_to_object
 cimport pyarrow.includes.libarrow as libarrow
 cimport cpython as cp
 
-cdef _pandas():
-    import pandas as pd
-    return pd
-
 arrow_init_numpy()
 set_numpy_nan(np.nan)
 
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 988d512..1c5dd71 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -215,6 +215,26 @@ def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
     assert arr.to_pylist() == expected
 
 
+@pytest.mark.parametrize("bits", [8, 16, 32, 64])
+def test_signed_integer_overflow(bits):
+    ty = getattr(pa, "int%d" % bits)()
+    # XXX ideally would raise OverflowError
+    with pytest.raises((ValueError, pa.ArrowException)):
+        pa.array([2 ** (bits - 1)], ty)
+    with pytest.raises((ValueError, pa.ArrowException)):
+        pa.array([-2 ** (bits - 1) - 1], ty)
+
+
+@pytest.mark.parametrize("bits", [8, 16, 32, 64])
+def test_unsigned_integer_overflow(bits):
+    ty = getattr(pa, "uint%d" % bits)()
+    # XXX ideally would raise OverflowError
+    with pytest.raises((ValueError, pa.ArrowException)):
+        pa.array([2 ** bits], ty)
+    with pytest.raises((ValueError, pa.ArrowException)):
+        pa.array([-1], ty)
+
+
 def test_garbage_collection():
     import gc
 
@@ -260,12 +280,14 @@ def test_sequence_bytes():
     u1 = b'ma\xc3\xb1ana'
     data = [b'foo',
             u1.decode('utf-8'),  # unicode gets encoded,
+            bytearray(b'bar'),
             None]
-    arr = pa.array(data)
-    assert len(arr) == 3
-    assert arr.null_count == 1
-    assert arr.type == pa.binary()
-    assert arr.to_pylist() == [b'foo', u1, None]
+    for ty in [None, pa.binary()]:
+        arr = pa.array(data, type=ty)
+        assert len(arr) == 4
+        assert arr.null_count == 1
+        assert arr.type == pa.binary()
+        assert arr.to_pylist() == [b'foo', u1, b'bar', None]
 
 
 def test_sequence_utf8_to_unicode():
@@ -281,12 +303,12 @@ def test_sequence_utf8_to_unicode():
 
 
 def test_sequence_fixed_size_bytes():
-    data = [b'foof', None, b'barb', b'2346']
+    data = [b'foof', None, bytearray(b'barb'), b'2346']
     arr = pa.array(data, type=pa.binary(4))
     assert len(arr) == 4
     assert arr.null_count == 1
     assert arr.type == pa.binary(4)
-    assert arr.to_pylist() == data
+    assert arr.to_pylist() == [b'foof', None, b'barb', b'2346']
 
 
 def test_fixed_size_bytes_does_not_accept_varying_lengths():
@@ -473,7 +495,7 @@ def test_sequence_mixed_types_with_specified_type_fails():
     data = ['-10', '-5', {'a': 1}, '0', '5', '10']
 
     type = pa.string()
-    with pytest.raises(pa.ArrowInvalid):
+    with pytest.raises(TypeError):
         pa.array(data, type=type)
 
 
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 04b1fa4..c6e2b75 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -30,7 +30,7 @@ import numpy.testing as npt
 import pandas as pd
 import pandas.util.testing as tm
 
-from pyarrow.compat import u, PY2
+from pyarrow.compat import PY2
 import pyarrow as pa
 import pyarrow.types as patypes
 
@@ -1065,13 +1065,13 @@ class TestConvertStringLikeTypes(object):
         _check_pandas_roundtrip(df, expected_schema=schema)
 
     def test_bytes_to_binary(self):
-        values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
+        values = [u'qux', b'foo', None, bytearray(b'barz'), 'qux', np.nan]
         df = pd.DataFrame({'strings': values})
 
         table = pa.Table.from_pandas(df)
         assert table[0].type == pa.binary()
 
-        values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
+        values2 = [b'qux', b'foo', None, b'barz', b'qux', np.nan]
         expected = pd.DataFrame({'strings': values2})
         _check_pandas_roundtrip(df, expected)
 
@@ -1093,7 +1093,7 @@ class TestConvertStringLikeTypes(object):
         assert table[0].data.num_chunks == 2
 
     def test_fixed_size_bytes(self):
-        values = [b'foo', None, b'bar', None, None, b'hey']
+        values = [b'foo', None, bytearray(b'bar'), None, None, b'hey']
         df = pd.DataFrame({'strings': values})
         schema = pa.schema([pa.field('strings', pa.binary(3))])
         table = pa.Table.from_pandas(df, schema=schema)
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index b0764fd..a14673f 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -465,7 +465,7 @@ class TestFeatherReader(unittest.TestCase):
 
         # non-strings
         df = pd.DataFrame({'a': ['a', 1, 2.0]})
-        self._assert_error_on_write(df, ValueError)
+        self._assert_error_on_write(df, TypeError)
 
     @pytest.mark.slow
     def test_large_dataframe(self):

-- 
To stop receiving notification emails like this one, please contact
apitrou@apache.org.