Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/30 04:03:09 UTC

[1/2] arrow git commit: ARROW-838: [Python] Expand pyarrow.array to handle NumPy arrays not originating in pandas

Repository: arrow
Updated Branches:
  refs/heads/master 7c616114f -> 796129b4f
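
For context: this change routes plain NumPy ndarrays through the same
conversion path as pandas data, so pyarrow.array no longer requires input to
originate in pandas. A minimal sketch of the new entry point (a sketch
against this commit; output reprs may differ):

    import numpy as np
    import pyarrow as pa

    # A NumPy array that never passed through pandas converts directly;
    # the Arrow type is inferred from the dtype (int64 here).
    arr = pa.array(np.array([1, 2, 3], dtype=np.int64))

    # An explicit boolean mask marks nulls (True means null).
    arr = pa.array(np.array([1, 2, 3], dtype=np.int64),
                   mask=np.array([False, True, False]))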


http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/pandas_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc
deleted file mode 100644
index dc5b67f..0000000
--- a/cpp/src/arrow/python/pandas_to_arrow.cc
+++ /dev/null
@@ -1,1215 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Functions for pandas conversion via NumPy
-
-#define ARROW_NO_DEFAULT_MEMORY_POOL
-
-#include "arrow/python/numpy_interop.h"
-
-#include "arrow/python/pandas_to_arrow.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/type_fwd.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit-util.h"
-#include "arrow/util/decimal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/visitor_inline.h"
-
-#include "arrow/compute/cast.h"
-#include "arrow/compute/context.h"
-
-#include "arrow/python/builtin_convert.h"
-#include "arrow/python/common.h"
-#include "arrow/python/config.h"
-#include "arrow/python/helpers.h"
-#include "arrow/python/numpy-internal.h"
-#include "arrow/python/numpy_convert.h"
-#include "arrow/python/type_traits.h"
-#include "arrow/python/util/datetime.h"
-
-namespace arrow {
-namespace py {
-
-using internal::NumPyTypeSize;
-
-// ----------------------------------------------------------------------
-// Conversion utilities
-
-static inline bool PyFloat_isnan(const PyObject* obj) {
-  if (PyFloat_Check(obj)) {
-    double val = PyFloat_AS_DOUBLE(obj);
-    return val != val;
-  } else {
-    return false;
-  }
-}
-static inline bool PandasObjectIsNull(const PyObject* obj) {
-  return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj);
-}
-
-static inline bool PyObject_is_string(const PyObject* obj) {
-#if PY_MAJOR_VERSION >= 3
-  return PyUnicode_Check(obj) || PyBytes_Check(obj);
-#else
-  return PyString_Check(obj) || PyUnicode_Check(obj);
-#endif
-}
-
-static inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); }
-
-static inline bool PyObject_is_integer(const PyObject* obj) {
-  return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj);
-}
-
-template <int TYPE>
-static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
-  typedef internal::npy_traits<TYPE> traits;
-  typedef typename traits::value_type T;
-
-  int64_t null_count = 0;
-
-  Ndarray1DIndexer<T> values(arr);
-  for (int i = 0; i < values.size(); ++i) {
-    if (traits::isnull(values[i])) {
-      ++null_count;
-    } else {
-      BitUtil::SetBit(bitmap, i);
-    }
-  }
-
-  return null_count;
-}
-
-// Returns null count
-static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
-  int64_t null_count = 0;
-
-  Ndarray1DIndexer<uint8_t> mask_values(mask);
-  for (int i = 0; i < length; ++i) {
-    if (mask_values[i]) {
-      ++null_count;
-    } else {
-      BitUtil::SetBit(bitmap, i);
-    }
-  }
-  return null_count;
-}
-
-template <int TYPE, typename BuilderType>
-static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
-  typedef internal::npy_traits<TYPE> traits;
-  typedef typename traits::value_type T;
-
-  // TODO(wesm): Vector append when not strided
-  Ndarray1DIndexer<T> values(array);
-  if (traits::supports_nulls) {
-    for (int64_t i = 0; i < values.size(); ++i) {
-      if (traits::isnull(values[i])) {
-        RETURN_NOT_OK(builder->AppendNull());
-      } else {
-        RETURN_NOT_OK(builder->Append(values[i]));
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < values.size(); ++i) {
-      RETURN_NOT_OK(builder->Append(values[i]));
-    }
-  }
-  return Status::OK();
-}
-
-Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
-  if (PyArray_NDIM(numpy_array) != 1) {
-    return Status::Invalid("only handle 1-dimensional arrays");
-  }
-
-  const int received_type = PyArray_DESCR(numpy_array)->type_num;
-  if (received_type != np_type) {
-    std::stringstream ss;
-    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
-       << GetNumPyTypeName(received_type);
-    return Status::Invalid(ss.str());
-  }
-
-  return Status::OK();
-}
-
-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
-
-/// Append as many string objects from NumPy arrays to a `StringBuilder` as we
-/// can fit
-///
-/// \param[in] offset starting offset for appending
-/// \param[out] values_consumed ending offset where we stopped appending. Will
-/// be length of arr if fully consumed
-/// \param[out] have_bytes true if we encountered any PyBytes object
-static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64_t offset,
-                                  StringBuilder* builder, int64_t* end_offset,
-                                  bool* have_bytes) {
-  PyObject* obj;
-
-  Ndarray1DIndexer<PyObject*> objects(arr);
-  Ndarray1DIndexer<uint8_t> mask_values;
-
-  bool have_mask = false;
-  if (mask != nullptr) {
-    mask_values.Init(mask);
-    have_mask = true;
-  }
-
-  for (; offset < objects.size(); ++offset) {
-    OwnedRef tmp_obj;
-    obj = objects[offset];
-    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
-      RETURN_NOT_OK(builder->AppendNull());
-      continue;
-    } else if (PyUnicode_Check(obj)) {
-      obj = PyUnicode_AsUTF8String(obj);
-      if (obj == NULL) {
-        PyErr_Clear();
-        return Status::Invalid("failed converting unicode to UTF8");
-      }
-      tmp_obj.reset(obj);
-    } else if (PyBytes_Check(obj)) {
-      *have_bytes = true;
-    } else {
-      std::stringstream ss;
-      ss << "Error converting to Python objects to String/UTF8: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
-      return Status::Invalid(ss.str());
-    }
-
-    const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
-    if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
-      break;
-    }
-    RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
-  }
-
-  // If we consumed the whole array, this will be the length of arr
-  *end_offset = offset;
-  return Status::OK();
-}
-
-static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mask,
-                                          int byte_width, int64_t offset,
-                                          FixedSizeBinaryBuilder* builder,
-                                          int64_t* end_offset) {
-  PyObject* obj;
-
-  Ndarray1DIndexer<PyObject*> objects(arr);
-  Ndarray1DIndexer<uint8_t> mask_values;
-
-  bool have_mask = false;
-  if (mask != nullptr) {
-    mask_values.Init(mask);
-    have_mask = true;
-  }
-
-  for (; offset < objects.size(); ++offset) {
-    OwnedRef tmp_obj;
-    obj = objects[offset];
-    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
-      RETURN_NOT_OK(builder->AppendNull());
-      continue;
-    } else if (PyUnicode_Check(obj)) {
-      obj = PyUnicode_AsUTF8String(obj);
-      if (obj == NULL) {
-        PyErr_Clear();
-        return Status::Invalid("failed converting unicode to UTF8");
-      }
-
-      tmp_obj.reset(obj);
-    } else if (!PyBytes_Check(obj)) {
-      std::stringstream ss;
-      ss << "Error converting to Python objects to FixedSizeBinary: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
-      return Status::Invalid(ss.str());
-    }
-
-    RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
-    if (ARROW_PREDICT_FALSE(builder->value_data_length() + byte_width >
-                            kBinaryMemoryLimit)) {
-      break;
-    }
-    RETURN_NOT_OK(
-        builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
-  }
-
-  // If we consumed the whole array, this will be the length of arr
-  *end_offset = offset;
-  return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Conversion from NumPy-in-Pandas to Arrow
-
-class PandasConverter {
- public:
-  PandasConverter(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                  const std::shared_ptr<DataType>& type)
-      : pool_(pool),
-        type_(type),
-        arr_(reinterpret_cast<PyArrayObject*>(ao)),
-        mask_(nullptr) {
-    if (mo != nullptr && mo != Py_None) {
-      mask_ = reinterpret_cast<PyArrayObject*>(mo);
-    }
-    length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
-  }
-
-  bool is_strided() const {
-    npy_intp* astrides = PyArray_STRIDES(arr_);
-    return astrides[0] != PyArray_DESCR(arr_)->elsize;
-  }
-
-  Status InitNullBitmap() {
-    int64_t null_bytes = BitUtil::BytesForBits(length_);
-
-    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
-
-    null_bitmap_data_ = null_bitmap_->mutable_data();
-    memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
-
-    return Status::OK();
-  }
-
-  // ----------------------------------------------------------------------
-  // Traditional visitor conversion for non-object arrays
-
-  template <typename ArrowType>
-  Status ConvertData(std::shared_ptr<Buffer>* data);
-
-  template <typename T>
-  Status PushBuilderResult(T* builder) {
-    std::shared_ptr<Array> out;
-    RETURN_NOT_OK(builder->Finish(&out));
-    out_arrays_.emplace_back(out);
-    return Status::OK();
-  }
-
-  Status PushArray(const std::shared_ptr<ArrayData>& data) {
-    std::shared_ptr<Array> result;
-    RETURN_NOT_OK(MakeArray(data, &result));
-    out_arrays_.emplace_back(std::move(result));
-    return Status::OK();
-  }
-
-  template <typename ArrowType>
-  Status VisitNative() {
-    using traits = internal::arrow_traits<ArrowType::type_id>;
-
-    if (mask_ != nullptr || traits::supports_nulls) {
-      RETURN_NOT_OK(InitNullBitmap());
-    }
-
-    std::shared_ptr<Buffer> data;
-    RETURN_NOT_OK(ConvertData<ArrowType>(&data));
-
-    int64_t null_count = 0;
-    if (mask_ != nullptr) {
-      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
-    } else if (traits::supports_nulls) {
-      // TODO(wesm): this presumes the NumPy C type and arrow C type are the
-      // same
-      null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
-    }
-
-    BufferVector buffers = {null_bitmap_, data};
-    auto arr_data =
-        std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count, 0);
-    return PushArray(arr_data);
-  }
-
-  template <typename T>
-  typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
-                              std::is_same<BooleanType, T>::value,
-                          Status>::type
-  Visit(const T& type) {
-    return VisitNative<T>();
-  }
-
-  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
-  Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); }
-  Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
-  Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
-  Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
-
-  Status TypeNotImplemented(std::string type_name) {
-    std::stringstream ss;
-    ss << "PandasConverter doesn't implement <" << type_name << "> conversion. ";
-    return Status::NotImplemented(ss.str());
-  }
-
-  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const FixedSizeBinaryType& type) {
-    return TypeNotImplemented(type.ToString());
-  }
-
-  Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); }
-
-  Status Convert() {
-    if (PyArray_NDIM(arr_) != 1) {
-      return Status::Invalid("only handle 1-dimensional arrays");
-    }
-
-    if (type_ == nullptr) {
-      return Status::Invalid("Must pass data type");
-    }
-
-    // Visit the type to perform conversion
-    return VisitTypeInline(*type_, this);
-  }
-
-  const std::vector<std::shared_ptr<Array>>& result() const { return out_arrays_; }
-
-  // ----------------------------------------------------------------------
-  // Conversion logic for various object dtype arrays
-
-  template <int ITEM_TYPE, typename ArrowType>
-  Status ConvertTypedLists(const std::shared_ptr<DataType>& type, ListBuilder* builder,
-                           PyObject* list);
-
-  template <typename ArrowType>
-  Status ConvertDates();
-
-  Status ConvertBooleans();
-  Status ConvertObjectStrings();
-  Status ConvertObjectFloats();
-  Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
-  Status ConvertObjectIntegers();
-  Status ConvertLists(const std::shared_ptr<DataType>& type);
-  Status ConvertLists(const std::shared_ptr<DataType>& type, ListBuilder* builder,
-                      PyObject* list);
-  Status ConvertDecimals();
-  Status ConvertTimes();
-  Status ConvertObjects();
-  Status ConvertObjectsInfer();
-  Status ConvertObjectsInferAndCast();
-
- protected:
-  MemoryPool* pool_;
-  std::shared_ptr<DataType> type_;
-  PyArrayObject* arr_;
-  PyArrayObject* mask_;
-  int64_t length_;
-
-  // Used in visitor pattern
-  std::vector<std::shared_ptr<Array>> out_arrays_;
-
-  std::shared_ptr<ResizableBuffer> null_bitmap_;
-  uint8_t* null_bitmap_data_;
-};
-
-template <typename T, typename T2>
-void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) {
-  // Passing input_data as non-const is a concession to PyObject*
-  int64_t j = 0;
-  for (int64_t i = 0; i < length; ++i) {
-    output_data[i] = static_cast<T2>(input_data[j]);
-    j += stride;
-  }
-}
-
-template <>
-void CopyStrided<PyObject*, PyObject*>(PyObject** input_data, int64_t length,
-                                       int64_t stride, PyObject** output_data) {
-  int64_t j = 0;
-  for (int64_t i = 0; i < length; ++i) {
-    output_data[i] = input_data[j];
-    if (output_data[i] != nullptr) {
-      Py_INCREF(output_data[i]);
-    }
-    j += stride;
-  }
-}
-
-static Status CastBuffer(const std::shared_ptr<Buffer>& input, const int64_t length,
-                         const std::shared_ptr<DataType>& in_type,
-                         const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
-                         std::shared_ptr<Buffer>* out) {
-  // Must cast
-  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, input};
-  auto tmp_data = std::make_shared<ArrayData>(in_type, length, buffers, 0);
-
-  std::shared_ptr<Array> tmp_array, casted_array;
-  RETURN_NOT_OK(MakeArray(tmp_data, &tmp_array));
-
-  compute::FunctionContext context(pool);
-  compute::CastOptions cast_options;
-  cast_options.allow_int_overflow = false;
-
-  RETURN_NOT_OK(
-      compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
-  *out = casted_array->data()->buffers[1];
-  return Status::OK();
-}
-
-template <typename ArrowType>
-inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
-  using traits = internal::arrow_traits<ArrowType::type_id>;
-  using T = typename traits::T;
-
-  if (is_strided()) {
-    // Strided, must copy into new contiguous memory
-    const int64_t stride = PyArray_STRIDES(arr_)[0];
-    const int64_t stride_elements = stride / sizeof(T);
-
-    auto new_buffer = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_));
-    CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr_)), length_, stride_elements,
-                reinterpret_cast<T*>(new_buffer->mutable_data()));
-    *data = new_buffer;
-  } else {
-    // Can zero-copy
-    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
-  }
-
-  std::shared_ptr<DataType> input_type;
-  RETURN_NOT_OK(
-      NumPyDtypeToArrow(reinterpret_cast<PyObject*>(PyArray_DESCR(arr_)), &input_type));
-
-  if (!input_type->Equals(*type_)) {
-    RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_, data));
-  }
-
-  return Status::OK();
-}
-
-template <>
-inline Status PandasConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
-  // Handle LONGLONG->INT64 and other fun things
-  int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
-  int type_size = NumPyTypeSize(type_num_compat);
-
-  if (type_size == 4) {
-    // Source and target are INT32, so can refer to the main implementation.
-    return ConvertData<Int32Type>(data);
-  } else if (type_size == 8) {
-    // We need to scale down from int64 to int32
-    auto new_buffer = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(new_buffer->Resize(sizeof(int32_t) * length_));
-
-    auto input = reinterpret_cast<const int64_t*>(PyArray_DATA(arr_));
-    auto output = reinterpret_cast<int32_t*>(new_buffer->mutable_data());
-
-    if (is_strided()) {
-      // Strided, must copy into new contiguous memory
-      const int64_t stride = PyArray_STRIDES(arr_)[0];
-      const int64_t stride_elements = stride / sizeof(int64_t);
-      CopyStrided(input, length_, stride_elements, output);
-    } else {
-      // TODO(wesm): int32 overflow checks
-      for (int64_t i = 0; i < length_; ++i) {
-        *output++ = static_cast<int32_t>(*input++);
-      }
-    }
-    *data = new_buffer;
-  } else {
-    std::stringstream ss;
-    ss << "Cannot convert NumPy array of element size ";
-    ss << type_size << " to a Date32 array";
-    return Status::NotImplemented(ss.str());
-  }
-
-  return Status::OK();
-}
-
-template <>
-inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
-  int64_t nbytes = BitUtil::BytesForBits(length_);
-  auto buffer = std::make_shared<PoolBuffer>(pool_);
-  RETURN_NOT_OK(buffer->Resize(nbytes));
-
-  Ndarray1DIndexer<uint8_t> values(arr_);
-
-  uint8_t* bitmap = buffer->mutable_data();
-
-  memset(bitmap, 0, nbytes);
-  for (int i = 0; i < length_; ++i) {
-    if (values[i] > 0) {
-      BitUtil::SetBit(bitmap, i);
-    }
-  }
-
-  *data = buffer;
-  return Status::OK();
-}
-
-template <typename T>
-struct UnboxDate {};
-
-template <>
-struct UnboxDate<Date32Type> {
-  static int32_t Unbox(PyObject* obj) {
-    return PyDate_to_days(reinterpret_cast<PyDateTime_Date*>(obj));
-  }
-};
-
-template <>
-struct UnboxDate<Date64Type> {
-  static int64_t Unbox(PyObject* obj) {
-    return PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(obj));
-  }
-};
-
-template <typename ArrowType>
-Status PandasConverter::ConvertDates() {
-  PyAcquireGIL lock;
-
-  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
-
-  Ndarray1DIndexer<PyObject*> objects(arr_);
-
-  if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
-  }
-
-  BuilderType builder(pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  /// We have to run this in this compilation unit, since we cannot use the
-  /// datetime API otherwise
-  PyDateTime_IMPORT;
-
-  PyObject* obj;
-  for (int64_t i = 0; i < length_; ++i) {
-    obj = objects[i];
-    if (PyDate_CheckExact(obj)) {
-      RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
-    } else if (PandasObjectIsNull(obj)) {
-      RETURN_NOT_OK(builder.AppendNull());
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Date: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "datetime.date", &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-
-  return PushBuilderResult(&builder);
-}
-
-Status PandasConverter::ConvertDecimals() {
-  PyAcquireGIL lock;
-
-  // Import the decimal module and Decimal class
-  OwnedRef decimal;
-  OwnedRef Decimal;
-  RETURN_NOT_OK(ImportModule("decimal", &decimal));
-  RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
-
-  Ndarray1DIndexer<PyObject*> objects(arr_);
-  PyObject* object = objects[0];
-
-  int precision;
-  int scale;
-
-  RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
-
-  type_ = std::make_shared<DecimalType>(precision, scale);
-
-  DecimalBuilder builder(type_, pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  for (int64_t i = 0; i < length_; ++i) {
-    object = objects[i];
-    if (PyObject_IsInstance(object, Decimal.obj())) {
-      std::string string;
-      RETURN_NOT_OK(PythonDecimalToString(object, &string));
-
-      Decimal128 value;
-      RETURN_NOT_OK(Decimal128::FromString(string, &value));
-      RETURN_NOT_OK(builder.Append(value));
-    } else if (PandasObjectIsNull(object)) {
-      RETURN_NOT_OK(builder.AppendNull());
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Decimal: ";
-      RETURN_NOT_OK(InvalidConversion(object, "decimal.Decimal", &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-  return PushBuilderResult(&builder);
-}
-
-Status PandasConverter::ConvertTimes() {
-  // Convert array of datetime.time objects to Arrow
-  PyAcquireGIL lock;
-  PyDateTime_IMPORT;
-
-  Ndarray1DIndexer<PyObject*> objects(arr_);
-
-  // datetime.time stores microsecond resolution
-  Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  PyObject* obj;
-  for (int64_t i = 0; i < length_; ++i) {
-    obj = objects[i];
-    if (PyTime_Check(obj)) {
-      RETURN_NOT_OK(builder.Append(PyTime_to_us(obj)));
-    } else if (PandasObjectIsNull(obj)) {
-      RETURN_NOT_OK(builder.AppendNull());
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Time: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "datetime.time", &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-  return PushBuilderResult(&builder);
-}
-
-Status PandasConverter::ConvertObjectStrings() {
-  PyAcquireGIL lock;
-
-  // The output type at this point is inconclusive because there may be bytes
-  // and unicode mixed in the object array
-  StringBuilder builder(pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  bool global_have_bytes = false;
-  int64_t offset = 0;
-  while (offset < length_) {
-    bool chunk_have_bytes = false;
-    RETURN_NOT_OK(
-        AppendObjectStrings(arr_, mask_, offset, &builder, &offset, &chunk_have_bytes));
-
-    global_have_bytes = global_have_bytes | chunk_have_bytes;
-    std::shared_ptr<Array> chunk;
-    RETURN_NOT_OK(builder.Finish(&chunk));
-    out_arrays_.emplace_back(std::move(chunk));
-  }
-
-  // If we saw PyBytes, convert everything to BinaryArray
-  if (global_have_bytes) {
-    for (size_t i = 0; i < out_arrays_.size(); ++i) {
-      auto binary_data = out_arrays_[i]->data()->ShallowCopy();
-      binary_data->type = ::arrow::binary();
-      out_arrays_[i] = std::make_shared<BinaryArray>(binary_data);
-    }
-  }
-  return Status::OK();
-}
-
-Status PandasConverter::ConvertObjectFloats() {
-  PyAcquireGIL lock;
-
-  Ndarray1DIndexer<PyObject*> objects(arr_);
-  Ndarray1DIndexer<uint8_t> mask_values;
-
-  bool have_mask = false;
-  if (mask_ != nullptr) {
-    mask_values.Init(mask_);
-    have_mask = true;
-  }
-
-  DoubleBuilder builder(pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  PyObject* obj;
-  for (int64_t i = 0; i < objects.size(); ++i) {
-    obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
-      RETURN_NOT_OK(builder.AppendNull());
-    } else if (PyFloat_Check(obj)) {
-      double val = PyFloat_AsDouble(obj);
-      RETURN_IF_PYERROR();
-      RETURN_NOT_OK(builder.Append(val));
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Double: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "float", &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-
-  return PushBuilderResult(&builder);
-}
-
-Status PandasConverter::ConvertObjectIntegers() {
-  PyAcquireGIL lock;
-
-  Int64Builder builder(pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  Ndarray1DIndexer<PyObject*> objects(arr_);
-  Ndarray1DIndexer<uint8_t> mask_values;
-
-  bool have_mask = false;
-  if (mask_ != nullptr) {
-    mask_values.Init(mask_);
-    have_mask = true;
-  }
-
-  PyObject* obj;
-  for (int64_t i = 0; i < objects.size(); ++i) {
-    obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
-      RETURN_NOT_OK(builder.AppendNull());
-    } else if (PyObject_is_integer(obj)) {
-      const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj));
-      RETURN_IF_PYERROR();
-      RETURN_NOT_OK(builder.Append(val));
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Int64: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "integer", &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-
-  return PushBuilderResult(&builder);
-}
-
-Status PandasConverter::ConvertObjectFixedWidthBytes(
-    const std::shared_ptr<DataType>& type) {
-  PyAcquireGIL lock;
-
-  int32_t byte_width = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
-
-  // The output type at this point is inconclusive because there may be bytes
-  // and unicode mixed in the object array
-  FixedSizeBinaryBuilder builder(type, pool_);
-  RETURN_NOT_OK(builder.Resize(length_));
-
-  int64_t offset = 0;
-  while (offset < length_) {
-    RETURN_NOT_OK(
-        AppendObjectFixedWidthBytes(arr_, mask_, byte_width, offset, &builder, &offset));
-
-    std::shared_ptr<Array> chunk;
-    RETURN_NOT_OK(builder.Finish(&chunk));
-    out_arrays_.emplace_back(std::move(chunk));
-  }
-  return Status::OK();
-}
-
-Status PandasConverter::ConvertBooleans() {
-  PyAcquireGIL lock;
-
-  Ndarray1DIndexer<PyObject*> objects(arr_);
-  Ndarray1DIndexer<uint8_t> mask_values;
-
-  bool have_mask = false;
-  if (mask_ != nullptr) {
-    mask_values.Init(mask_);
-    have_mask = true;
-  }
-
-  int64_t nbytes = BitUtil::BytesForBits(length_);
-  auto data = std::make_shared<PoolBuffer>(pool_);
-  RETURN_NOT_OK(data->Resize(nbytes));
-  uint8_t* bitmap = data->mutable_data();
-  memset(bitmap, 0, nbytes);
-
-  int64_t null_count = 0;
-  PyObject* obj;
-  for (int64_t i = 0; i < length_; ++i) {
-    obj = objects[i];
-    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
-      ++null_count;
-    } else if (obj == Py_True) {
-      BitUtil::SetBit(bitmap, i);
-      BitUtil::SetBit(null_bitmap_data_, i);
-    } else if (obj == Py_False) {
-      BitUtil::SetBit(null_bitmap_data_, i);
-    } else {
-      std::stringstream ss;
-      ss << "Error converting from Python objects to Boolean: ";
-      RETURN_NOT_OK(InvalidConversion(obj, "bool", &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-
-  out_arrays_.push_back(
-      std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count));
-  return Status::OK();
-}
-
-Status PandasConverter::ConvertObjectsInfer() {
-  Ndarray1DIndexer<PyObject*> objects;
-
-  PyAcquireGIL lock;
-  objects.Init(arr_);
-  PyDateTime_IMPORT;
-
-  OwnedRef decimal;
-  OwnedRef Decimal;
-  RETURN_NOT_OK(ImportModule("decimal", &decimal));
-  RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
-
-  for (int64_t i = 0; i < length_; ++i) {
-    PyObject* obj = objects[i];
-    if (PandasObjectIsNull(obj)) {
-      continue;
-    } else if (PyObject_is_string(obj)) {
-      return ConvertObjectStrings();
-    } else if (PyObject_is_float(obj)) {
-      return ConvertObjectFloats();
-    } else if (PyBool_Check(obj)) {
-      return ConvertBooleans();
-    } else if (PyObject_is_integer(obj)) {
-      return ConvertObjectIntegers();
-    } else if (PyDate_CheckExact(obj)) {
-      // We could choose Date32 or Date64
-      return ConvertDates<Date32Type>();
-    } else if (PyTime_Check(obj)) {
-      return ConvertTimes();
-    } else if (PyObject_IsInstance(const_cast<PyObject*>(obj), Decimal.obj())) {
-      return ConvertDecimals();
-    } else if (PyList_Check(obj) || PyArray_Check(obj)) {
-      std::shared_ptr<DataType> inferred_type;
-      RETURN_NOT_OK(InferArrowType(obj, &inferred_type));
-      return ConvertLists(inferred_type);
-    } else {
-      const std::string supported_types =
-          "string, bool, float, int, date, time, decimal, list, array";
-      std::stringstream ss;
-      ss << "Error inferring Arrow type for Python object array. ";
-      RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss));
-      return Status::Invalid(ss.str());
-    }
-  }
-  out_arrays_.push_back(std::make_shared<NullArray>(length_));
-  return Status::OK();
-}
-
-Status PandasConverter::ConvertObjectsInferAndCast() {
-  size_t position = out_arrays_.size();
-  RETURN_NOT_OK(ConvertObjectsInfer());
-
-  std::shared_ptr<Array> arr = out_arrays_[position];
-
-  // Perform cast
-  compute::FunctionContext context(pool_);
-  compute::CastOptions options;
-  options.allow_int_overflow = false;
-
-  std::shared_ptr<Array> casted;
-  RETURN_NOT_OK(compute::Cast(&context, *arr, type_, options, &casted));
-
-  // Replace with casted values
-  out_arrays_[position] = casted;
-
-  return Status::OK();
-}
-
-Status PandasConverter::ConvertObjects() {
-  // Python object arrays are annoying, since we could have one of:
-  //
-  // * Strings
-  // * Booleans with nulls
-  // * decimal.Decimals
-  // * Mixed type (not supported at the moment by arrow format)
-  //
-  // Additionally, nulls may be encoded either as np.nan or None. So we have to
-  // do some type inference and conversion
-
-  RETURN_NOT_OK(InitNullBitmap());
-
-  // This means we received an explicit type from the user
-  if (type_) {
-    switch (type_->id()) {
-      case Type::STRING:
-        return ConvertObjectStrings();
-      case Type::FIXED_SIZE_BINARY:
-        return ConvertObjectFixedWidthBytes(type_);
-      case Type::BOOL:
-        return ConvertBooleans();
-      case Type::DATE32:
-        return ConvertDates<Date32Type>();
-      case Type::DATE64:
-        return ConvertDates<Date64Type>();
-      case Type::LIST: {
-        const auto& list_field = static_cast<const ListType&>(*type_);
-        return ConvertLists(list_field.value_field()->type());
-      }
-      case Type::DECIMAL:
-        return ConvertDecimals();
-      default:
-        return ConvertObjectsInferAndCast();
-    }
-  } else {
-    // Re-acquire GIL
-    return ConvertObjectsInfer();
-  }
-}
-
-template <typename T>
-Status LoopPySequence(PyObject* sequence, T func) {
-  if (PySequence_Check(sequence)) {
-    OwnedRef ref;
-    Py_ssize_t size = PySequence_Size(sequence);
-    if (PyArray_Check(sequence)) {
-      auto array = reinterpret_cast<PyArrayObject*>(sequence);
-      Ndarray1DIndexer<PyObject*> objects(array);
-      for (int64_t i = 0; i < size; ++i) {
-        RETURN_NOT_OK(func(objects[i]));
-      }
-    } else {
-      for (int64_t i = 0; i < size; ++i) {
-        ref.reset(PySequence_GetItem(sequence, i));
-        RETURN_NOT_OK(func(ref.obj()));
-      }
-    }
-  } else if (PyObject_HasAttrString(sequence, "__iter__")) {
-    OwnedRef iter = OwnedRef(PyObject_GetIter(sequence));
-    PyObject* item;
-    while ((item = PyIter_Next(iter.obj()))) {
-      OwnedRef ref = OwnedRef(item);
-      RETURN_NOT_OK(func(ref.obj()));
-    }
-  } else {
-    return Status::TypeError("Object is not a sequence or iterable");
-  }
-
-  return Status::OK();
-}
-
-template <int ITEM_TYPE, typename ArrowType>
-inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
-                                                 ListBuilder* builder, PyObject* list) {
-  typedef internal::npy_traits<ITEM_TYPE> traits;
-  typedef typename traits::BuilderClass BuilderT;
-
-  PyAcquireGIL lock;
-
-  // TODO: mask not supported here
-  if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
-  }
-
-  BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
-
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
-      return builder->AppendNull();
-    } else if (PyArray_Check(object)) {
-      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
-      RETURN_NOT_OK(builder->Append(true));
-
-      // TODO(uwe): Support more complex numpy array structures
-      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
-
-      return AppendNdarrayToBuilder<ITEM_TYPE, BuilderT>(numpy_array, value_builder);
-    } else if (PyList_Check(object)) {
-      int64_t size;
-      std::shared_ptr<DataType> inferred_type;
-      RETURN_NOT_OK(builder->Append(true));
-      RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
-      if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) {
-        std::stringstream ss;
-        ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
-        return Status::TypeError(ss.str());
-      }
-      return AppendPySequence(object, size, type, value_builder);
-    } else {
-      return Status::TypeError("Unsupported Python type for list items");
-    }
-  };
-
-  return LoopPySequence(list, foreach_item);
-}
-
-template <>
-inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
-    const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
-  PyAcquireGIL lock;
-
-  // TODO: mask not supported here
-  if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
-  }
-
-  auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
-
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
-      return builder->AppendNull();
-    } else if (PyArray_Check(object)) {
-      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
-      RETURN_NOT_OK(builder->Append(true));
-
-      // TODO(uwe): Support more complex numpy array structures
-      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
-
-      for (int64_t i = 0; i < static_cast<int64_t>(PyArray_SIZE(numpy_array)); ++i) {
-        RETURN_NOT_OK(value_builder->AppendNull());
-      }
-      return Status::OK();
-    } else if (PyList_Check(object)) {
-      RETURN_NOT_OK(builder->Append(true));
-      const Py_ssize_t size = PySequence_Size(object);
-      for (Py_ssize_t i = 0; i < size; ++i) {
-        RETURN_NOT_OK(value_builder->AppendNull());
-      }
-      return Status::OK();
-    } else {
-      return Status::TypeError("Unsupported Python type for list items");
-    }
-  };
-
-  return LoopPySequence(list, foreach_item);
-}
-
-template <>
-inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
-    const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
-  PyAcquireGIL lock;
-  // TODO: If there are bytes involved, convert to Binary representation
-  bool have_bytes = false;
-
-  // TODO: mask not supported here
-  if (mask_ != nullptr) {
-    return Status::NotImplemented("mask not supported in object conversions yet");
-  }
-
-  auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
-
-  auto foreach_item = [&](PyObject* object) {
-    if (PandasObjectIsNull(object)) {
-      return builder->AppendNull();
-    } else if (PyArray_Check(object)) {
-      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
-      RETURN_NOT_OK(builder->Append(true));
-
-      // TODO(uwe): Support more complex numpy array structures
-      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
-
-      int64_t offset = 0;
-      RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, value_builder, &offset,
-                                        &have_bytes));
-      if (offset < PyArray_SIZE(numpy_array)) {
-        return Status::Invalid("Array cell value exceeded 2GB");
-      }
-      return Status::OK();
-    } else if (PyList_Check(object)) {
-      int64_t size;
-      std::shared_ptr<DataType> inferred_type;
-      RETURN_NOT_OK(builder->Append(true));
-      RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
-      if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) {
-        std::stringstream ss;
-        ss << inferred_type->ToString() << " cannot be converted to STRING.";
-        return Status::TypeError(ss.str());
-      }
-      return AppendPySequence(object, size, inferred_type, value_builder);
-    } else {
-      return Status::TypeError("Unsupported Python type for list items");
-    }
-  };
-
-  return LoopPySequence(list, foreach_item);
-}
-
-#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)                            \
-  case Type::TYPE: {                                                      \
-    return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, builder, list); \
-  }
-
-Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type,
-                                     ListBuilder* builder, PyObject* list) {
-  switch (type->id()) {
-    LIST_CASE(NA, NPY_OBJECT, NullType)
-    LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
-    LIST_CASE(INT8, NPY_INT8, Int8Type)
-    LIST_CASE(UINT16, NPY_UINT16, UInt16Type)
-    LIST_CASE(INT16, NPY_INT16, Int16Type)
-    LIST_CASE(UINT32, NPY_UINT32, UInt32Type)
-    LIST_CASE(INT32, NPY_INT32, Int32Type)
-    LIST_CASE(UINT64, NPY_UINT64, UInt64Type)
-    LIST_CASE(INT64, NPY_INT64, Int64Type)
-    LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
-    LIST_CASE(FLOAT, NPY_FLOAT, FloatType)
-    LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
-    LIST_CASE(STRING, NPY_OBJECT, StringType)
-    case Type::LIST: {
-      const ListType& list_type = static_cast<const ListType&>(*type);
-      auto value_builder = static_cast<ListBuilder*>(builder->value_builder());
-
-      auto foreach_item = [&](PyObject* object) {
-        if (PandasObjectIsNull(object)) {
-          return builder->AppendNull();
-        } else {
-          RETURN_NOT_OK(builder->Append(true));
-          return ConvertLists(list_type.value_type(), value_builder, object);
-        }
-      };
-
-      return LoopPySequence(list, foreach_item);
-    }
-    default: {
-      std::stringstream ss;
-      ss << "Unknown list item type: ";
-      ss << type->ToString();
-      return Status::TypeError(ss.str());
-    }
-  }
-}
-
-Status PandasConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
-  std::unique_ptr<ArrayBuilder> array_builder;
-  RETURN_NOT_OK(MakeBuilder(pool_, arrow::list(type), &array_builder));
-  ListBuilder* list_builder = static_cast<ListBuilder*>(array_builder.get());
-  RETURN_NOT_OK(ConvertLists(type, list_builder, reinterpret_cast<PyObject*>(arr_)));
-  return PushBuilderResult(list_builder);
-}
-
-Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                     const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
-  PandasConverter converter(pool, ao, mo, type);
-  RETURN_NOT_OK(converter.Convert());
-  *out = converter.result()[0];
-  DCHECK(*out);
-  return Status::OK();
-}
-
-Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                            const std::shared_ptr<DataType>& type,
-                            std::shared_ptr<ChunkedArray>* out) {
-  PandasConverter converter(pool, ao, mo, type);
-  RETURN_NOT_OK(converter.ConvertObjects());
-  *out = std::make_shared<ChunkedArray>(converter.result());
-  return Status::OK();
-}
-
-}  // namespace py
-}  // namespace arrow
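
The MaskToBitmap and ValuesToBitmap helpers in the file above encode Arrow's
validity convention: the incoming mask marks nulls with True, while a set bit
in the Arrow validity bitmap marks a valid value, so the polarity is
inverted. A rough Python rendering of MaskToBitmap, for illustration only:

    def mask_to_bitmap(mask, length, bitmap):
        # mask[i] truthy => null; otherwise set the validity bit,
        # mirroring BitUtil::SetBit's little-endian bit order.
        null_count = 0
        for i in range(length):
            if mask[i]:
                null_count += 1
            else:
                bitmap[i // 8] |= 1 << (i % 8)
        return null_count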

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/pandas_to_arrow.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_to_arrow.h b/cpp/src/arrow/python/pandas_to_arrow.h
deleted file mode 100644
index 3e655ba..0000000
--- a/cpp/src/arrow/python/pandas_to_arrow.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Converting from pandas memory representation to Arrow data structures
-
-#ifndef ARROW_PYTHON_PANDAS_TO_ARROW_H
-#define ARROW_PYTHON_PANDAS_TO_ARROW_H
-
-#include "arrow/python/platform.h"
-
-#include <memory>
-
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-class DataType;
-class MemoryPool;
-class Status;
-
-namespace py {
-
-ARROW_EXPORT
-Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                     const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);
-
-/// Convert dtype=object arrays. If the target data type is not known, pass
-/// nullptr for type
-///
-/// \param[in] pool Memory pool for any memory allocations
-/// \param[in] ao an ndarray with the array data
-/// \param[in] mo an ndarray with a null mask (True is null), optional
-/// \param[in] type
-/// \param[out] out a ChunkedArray, to accommodate chunked output
-ARROW_EXPORT
-Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
-                            const std::shared_ptr<DataType>& type,
-                            std::shared_ptr<ChunkedArray>* out);
-
-}  // namespace py
-}  // namespace arrow
-
-#endif  // ARROW_PYTHON_PANDAS_TO_ARROW_H
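
Note the ChunkedArray output type above: as the .cc file shows, each chunk's
accumulated binary data is capped at kBinaryMemoryLimit (INT32_MAX bytes), so
converting a dtype=object array whose strings exceed that limit produces
multiple chunks. How this surfaces in Python (a sketch; the exact threshold
is an implementation detail):

    import numpy as np
    import pyarrow as pa

    # Small object arrays come back as a single Array chunk...
    result = pa.array(np.array(['a', 'b'], dtype=object))

    # ...but once a chunk's string data would pass 2**31 - 1 bytes,
    # conversion continues in a fresh chunk and the caller receives
    # a pyarrow.ChunkedArray instead.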

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 0d76a35..ac06948 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -36,7 +36,7 @@ from pyarrow.lib import (null, bool_,
                          time32, time64, timestamp, date32, date64,
                          float16, float32, float64,
                          binary, string, decimal,
-                         list_, struct, dictionary, field,
+                         list_, struct, dictionary, field, type_for_alias,
                          DataType, NAType,
                          Field,
                          Schema,
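
type_for_alias, newly exported here, resolves a string alias to a DataType;
it is what lets the reworked Array.cast below accept plain strings. A small
usage sketch (assuming the standard aliases such as 'int64' and 'float64'):

    import pyarrow as pa

    t = pa.type_for_alias('int64')    # same as pa.int64()

    # _ensure_type in array.pxi applies this, so casts accept strings:
    arr = pa.array([1, 2, 3])
    arr.cast('float64')               # equivalent to arr.cast(pa.float64())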

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index eec6180..f402def 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -16,58 +16,161 @@
 # under the License.
 
 
-def array(object sequence, DataType type=None, MemoryPool memory_pool=None,
-          size=None):
+cdef _sequence_to_array(object sequence, object size, DataType type,
+                        CMemoryPool* pool):
+    cdef shared_ptr[CArray] out
+    cdef int64_t c_size
+    if type is None:
+        with nogil:
+            check_status(ConvertPySequence(sequence, pool, &out))
+    else:
+        if size is None:
+            with nogil:
+                check_status(
+                    ConvertPySequence(
+                        sequence, pool, &out, type.sp_type
+                    )
+                )
+        else:
+            c_size = size
+            with nogil:
+                check_status(
+                    ConvertPySequence(
+                        sequence, pool, &out, type.sp_type, c_size
+                    )
+                )
+
+    return pyarrow_wrap_array(out)
+
+
+cdef _is_array_like(obj):
+    try:
+        import pandas as pd
+        return isinstance(obj, (np.ndarray, pd.Series, pd.Index, Categorical))
+    except ImportError:
+        return isinstance(obj, np.ndarray)
+
+
+cdef _ndarray_to_array(object values, object mask, DataType type,
+                       c_bool use_pandas_null_sentinels,
+                       CMemoryPool* pool):
+    cdef shared_ptr[CChunkedArray] chunked_out
+    cdef shared_ptr[CDataType] c_type
+
+    dtype = values.dtype
+
+    if type is None and dtype != object:
+        with nogil:
+            check_status(NumPyDtypeToArrow(dtype, &c_type))
+
+    if type is not None:
+        c_type = type.sp_type
+
+    with nogil:
+        check_status(NdarrayToArrow(pool, values, mask,
+                                    use_pandas_null_sentinels,
+                                    c_type, &chunked_out))
+
+    if chunked_out.get().num_chunks() > 1:
+        return pyarrow_wrap_chunked_array(chunked_out)
+    else:
+        return pyarrow_wrap_array(chunked_out.get().chunk(0))
+
+
+cdef DataType _ensure_type(object type):
+    if type is None:
+        return None
+    elif not isinstance(type, DataType):
+        return type_for_alias(type)
+    else:
+        return type
+
+
+def array(object obj, type=None, mask=None,
+          MemoryPool memory_pool=None, size=None,
+          from_pandas=False):
     """
-    Create pyarrow.Array instance from a Python sequence
+    Create pyarrow.Array instance from a Python object
 
     Parameters
     ----------
-    sequence : sequence-like or iterable object of Python objects.
-        If both type and size are specified may be a single use iterable.
-    type : pyarrow.DataType, optional
-        If not passed, will be inferred from the data
+    obj : sequence, iterable, ndarray or Series
+        If both type and size are specified, may be a single-use iterable. If
+        not strongly-typed, the Arrow type will be inferred for the result
+    type : pyarrow.DataType
+        Explicit type to attempt to coerce to, otherwise will be inferred from
+        the data
+    mask : array (boolean), optional
+        Indicate which values are null (True) or not null (False)
     memory_pool : pyarrow.MemoryPool, optional
         If not passed, will allocate memory from the currently-set default
         memory pool
+
     size : int64, optional
         Size of the elements. If the input is larger than size, bail at this
         length. For iterators, if size is larger than the input iterator this
         will be treated as a "max size", but will involve an initial allocation
         of size followed by a resize to the actual size (so if you know the
         exact size, specifying it correctly will give you better performance).
+    from_pandas : boolean, default False
+        Use pandas's semantics for inferring nulls from values in ndarray-like
+        data. If passed, the mask takes precedence, but if a value is unmasked
+        (not null) yet null according to pandas semantics, then it is still
+        treated as null
+
+    Notes
+    -----
+    Localized timestamps will currently be returned as UTC (pandas's native
+    representation).  Timezone-naive data will be implicitly interpreted as
+    UTC.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import pyarrow as pa
+    >>> pa.array(pd.Series([1, 2]))
+    <pyarrow.array.Int64Array object at 0x7f674e4c0e10>
+    [
+      1,
+      2
+    ]
+
+    >>> import numpy as np
+    >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1],
+    ... dtype=bool))
+    <pyarrow.array.Int64Array object at 0x7f9019e11208>
+    [
+      1,
+      NA
+    ]
 
     Returns
     -------
-    array : pyarrow.Array
+    array : pyarrow.Array or pyarrow.ChunkedArray (if object data
+    overflowed binary storage)
     """
-    cdef:
-        shared_ptr[CArray] sp_array
-        CMemoryPool* pool
-        int64_t c_size
+    type = _ensure_type(type)
+    cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
 
-    pool = maybe_unbox_memory_pool(memory_pool)
-    if type is None:
-        with nogil:
-            check_status(ConvertPySequence(sequence, pool, &sp_array))
-    else:
-        if size is None:
-            with nogil:
-                check_status(
-                    ConvertPySequence(
-                        sequence, pool, &sp_array, type.sp_type
-                    )
-                )
-        else:
-            c_size = size
-            with nogil:
-                check_status(
-                    ConvertPySequence(
-                        sequence, pool, &sp_array, type.sp_type, c_size
-                    )
-                )
+    if _is_array_like(obj):
+        if mask is not None:
+            mask = get_series_values(mask)
+
+        values = get_series_values(obj)
 
-    return pyarrow_wrap_array(sp_array)
+        if isinstance(values, Categorical):
+            return DictionaryArray.from_arrays(
+                values.codes, values.categories.values,
+                mask=mask, ordered=values.ordered,
+                memory_pool=memory_pool)
+        else:
+            values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
+                                                        type)
+            return _ndarray_to_array(values, mask, type, from_pandas, pool)
+    else:
+        if mask is not None:
+            raise ValueError("Masks only supported with ndarray-like inputs")
+        return _sequence_to_array(obj, size, type, pool)
 
 
 def _normalize_slice(object arrow_obj, slice key):
@@ -112,7 +215,7 @@ cdef class Array:
         with nogil:
             check_status(DebugPrint(deref(self.ap), 0))
 
-    def cast(self, DataType target_type, safe=True):
+    def cast(self, object target_type, safe=True):
         """
         Cast array values to another data type
 
@@ -130,42 +233,37 @@ cdef class Array:
         cdef:
             CCastOptions options
             shared_ptr[CArray] result
+            DataType type
+
+        type = _ensure_type(target_type)
 
         if not safe:
             options.allow_int_overflow = 1
 
         with nogil:
-            check_status(Cast(_context(), self.ap[0], target_type.sp_type,
+            check_status(Cast(_context(), self.ap[0], type.sp_type,
                               options, &result))
 
         return pyarrow_wrap_array(result)
 
     @staticmethod
-    def from_pandas(obj, mask=None, DataType type=None,
-                    timestamps_to_ms=False,
-                    MemoryPool memory_pool=None):
+    def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
         """
-        Convert pandas.Series to an Arrow Array.
+        Convert pandas.Series to an Arrow Array, using pandas's semantics about
+        what values indicate nulls. See pyarrow.array for more general
+        conversion from arrays or sequences to Arrow arrays
 
         Parameters
         ----------
-        series : pandas.Series or numpy.ndarray
-
-        mask : pandas.Series or numpy.ndarray, optional
-            boolean mask if the object is null (True) or valid (False)
-
+        obj : ndarray, pandas.Series or pandas.Index
+        mask : array (boolean), optional
+            Indicate which values are null (True) or not null (False)
         type : pyarrow.DataType
-            Explicit type to attempt to coerce to
-
-        timestamps_to_ms : bool, optional
-            Convert datetime columns to ms resolution. This is needed for
-            compatibility with other functionality like Parquet I/O which
-            only supports milliseconds.
-
-            .. deprecated:: 0.7.0
-
-        memory_pool: MemoryPool, optional
-            Specific memory pool to use to allocate the resulting Arrow array.
+            Explicit type to attempt to coerce to, otherwise will be inferred
+            from the data
+        memory_pool : pyarrow.MemoryPool, optional
+            If not passed, will allocate memory from the currently-set default
+            memory pool
 
         Notes
         -----
@@ -173,78 +271,13 @@ cdef class Array:
         representation).  Timezone-naive data will be implicitly interpreted as
         UTC.
 
-        Examples
-        --------
-
-        >>> import pandas as pd
-        >>> import pyarrow as pa
-        >>> pa.Array.from_pandas(pd.Series([1, 2]))
-        <pyarrow.array.Int64Array object at 0x7f674e4c0e10>
-        [
-          1,
-          2
-        ]
-
-        >>> import numpy as np
-        >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1],
-        ... dtype=bool))
-        <pyarrow.array.Int64Array object at 0x7f9019e11208>
-        [
-          1,
-          NA
-        ]
-
         Returns
         -------
         array : pyarrow.Array or pyarrow.ChunkedArray (if object data
-        overflowed binary storage)
+        overflows binary buffer)
         """
-        cdef:
-            shared_ptr[CArray] out
-            shared_ptr[CChunkedArray] chunked_out
-            shared_ptr[CDataType] c_type
-            CMemoryPool* pool
-
-        if mask is not None:
-            mask = get_series_values(mask)
-
-        values = get_series_values(obj)
-        pool = maybe_unbox_memory_pool(memory_pool)
-
-        if isinstance(values, Categorical):
-            return DictionaryArray.from_arrays(
-                values.codes, values.categories.values,
-                mask=mask, ordered=values.ordered,
-                memory_pool=memory_pool)
-        elif values.dtype == object:
-            # Object dtype undergoes a different conversion path as more type
-            # inference may be needed
-            if type is not None:
-                c_type = type.sp_type
-            with nogil:
-                check_status(PandasObjectsToArrow(
-                    pool, values, mask, c_type, &chunked_out))
-
-            if chunked_out.get().num_chunks() > 1:
-                return pyarrow_wrap_chunked_array(chunked_out)
-            else:
-                out = chunked_out.get().chunk(0)
-        else:
-            values, type = pdcompat.maybe_coerce_datetime64(
-                values, obj.dtype, type, timestamps_to_ms=timestamps_to_ms)
-
-            if type is None:
-                dtype = values.dtype
-                with nogil:
-                    check_status(NumPyDtypeToArrow(dtype, &c_type))
-            else:
-                c_type = type.sp_type
-
-            with nogil:
-                check_status(PandasToArrow(
-                    pool, values, mask, c_type, &out))
-
-        return pyarrow_wrap_array(out)
+        return array(obj, mask=mask, type=type, memory_pool=memory_pool,
+                     from_pandas=True)
 
     property null_count:
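
With Array.from_pandas now a thin wrapper over pa.array(..., from_pandas=True),
the flag is what toggles pandas null sentinels. A sketch of the difference,
per the behavior documented in the docstrings above:

    import numpy as np
    import pyarrow as pa

    data = np.array([1.0, np.nan])

    pa.array(data)                    # NaN kept as a floating-point value
    pa.array(data, from_pandas=True)  # NaN inferred as null, pandas-style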
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 5e67088..fc17d1c 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -766,13 +766,10 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
 
     CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
 
-    CStatus PandasToArrow(CMemoryPool* pool, object ao, object mo,
-                          const shared_ptr[CDataType]& type,
-                          shared_ptr[CArray]* out)
-
-    CStatus PandasObjectsToArrow(CMemoryPool* pool, object ao, object mo,
-                                 const shared_ptr[CDataType]& type,
-                                 shared_ptr[CChunkedArray]* out)
+    CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
+                           c_bool use_pandas_null_sentinels,
+                           const shared_ptr[CDataType]& type,
+                           shared_ptr[CChunkedArray]* out)
 
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
                             shared_ptr[CTensor]* out)

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/pandas_compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index d1e6f5a..be48aeb 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -203,7 +203,7 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
     }
 
 
-def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index):
+def dataframe_to_arrays(df, schema, preserve_index):
     names = []
     arrays = []
     index_columns = []
@@ -223,15 +223,13 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index):
             field = schema.field_by_name(name)
             type = getattr(field, "type", None)
 
-        array = pa.Array.from_pandas(
-            col, type=type, timestamps_to_ms=timestamps_to_ms
-        )
+        array = pa.array(col, from_pandas=True, type=type)
         arrays.append(array)
         names.append(name)
         types.append(array.type)
 
     for i, column in enumerate(index_columns):
-        array = pa.Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
+        array = pa.array(column)
         arrays.append(array)
         names.append(index_level_name(column, i))
         types.append(array.type)
@@ -242,25 +240,15 @@ def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index):
     return names, arrays, metadata
 
 
-def maybe_coerce_datetime64(values, dtype, type_, timestamps_to_ms=False):
-    if timestamps_to_ms:
-        import warnings
-        warnings.warn('timestamps_to_ms=True is deprecated', FutureWarning)
-
+def get_datetimetz_type(values, dtype, type_):
     from pyarrow.compat import DatetimeTZDtype
 
     if values.dtype.type != np.datetime64:
         return values, type_
 
-    coerce_ms = timestamps_to_ms and values.dtype != 'datetime64[ms]'
-
-    if coerce_ms:
-        values = values.astype('datetime64[ms]')
-        type_ = pa.timestamp('ms')
-
     if isinstance(dtype, DatetimeTZDtype):
         tz = dtype.tz
-        unit = 'ms' if coerce_ms else dtype.unit
+        unit = dtype.unit
         type_ = pa.timestamp(unit, tz)
     elif type_ is None:
         # Trust the NumPy dtype

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/scalar.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 3a847f7..c37ed3b 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -348,10 +348,10 @@ cdef class StructValue(ArrayValue):
 
 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanValue,
-    _Type_UINT8: Int8Value,
-    _Type_UINT16: Int16Value,
-    _Type_UINT32: Int32Value,
-    _Type_UINT64: Int64Value,
+    _Type_UINT8: UInt8Value,
+    _Type_UINT16: UInt16Value,
+    _Type_UINT32: UInt32Value,
+    _Type_UINT64: UInt64Value,
     _Type_INT8: Int8Value,
     _Type_INT16: Int16Value,
     _Type_INT32: Int32Value,

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/table.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 028797e..e5422a5 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -575,7 +575,7 @@ cdef class RecordBatch:
         pyarrow.RecordBatch
         """
         names, arrays, metadata = pdcompat.dataframe_to_arrays(
-            df, False, schema, preserve_index
+            df, schema, preserve_index
         )
         return cls.from_arrays(arrays, names, metadata)
 
@@ -714,21 +714,13 @@ cdef class Table:
         return result
 
     @classmethod
-    def from_pandas(cls, df, bint timestamps_to_ms=False,
-                    Schema schema=None, bint preserve_index=True):
+    def from_pandas(cls, df, Schema schema=None, bint preserve_index=True):
         """
         Convert pandas.DataFrame to an Arrow Table
 
         Parameters
         ----------
         df : pandas.DataFrame
-        timestamps_to_ms : bool
-            Convert datetime columns to ms resolution. This is needed for
-            compability with other functionality like Parquet I/O which
-            only supports milliseconds.
-
-            .. deprecated:: 0.7.0
-
         schema : pyarrow.Schema, optional
             The expected schema of the Arrow Table. This can be used to
             indicate the type of columns if we cannot infer it automatically.
@@ -754,7 +746,6 @@ cdef class Table:
         """
         names, arrays, metadata = pdcompat.dataframe_to_arrays(
             df,
-            timestamps_to_ms=timestamps_to_ms,
             schema=schema,
             preserve_index=preserve_index
         )

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index f316417..3bf3926 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -149,6 +149,14 @@ def test_array_factory_invalid_type():
         pa.array(arr)
 
 
+def test_array_ref_to_ndarray_base():
+    arr = np.array([1, 2, 3])
+
+    refcount = sys.getrefcount(arr)
+    arr2 = pa.array(arr)  # noqa
+    assert sys.getrefcount(arr) == (refcount + 1)
+
+
 def test_dictionary_from_numpy():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
@@ -170,8 +178,8 @@ def test_dictionary_from_boxed_arrays():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
 
-    iarr = pa.Array.from_pandas(indices)
-    darr = pa.Array.from_pandas(dictionary)
+    iarr = pa.array(indices)
+    darr = pa.array(dictionary)
 
     d1 = pa.DictionaryArray.from_arrays(iarr, darr)
 
@@ -201,9 +209,9 @@ def test_dictionary_with_pandas():
 
 def test_list_from_arrays():
     offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
-    offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32())
+    offsets = pa.array(offsets_arr, type='int32')
     pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
-    values = pa.array(pyvalues, type=pa.binary())
+    values = pa.array(pyvalues, type='binary')
 
     result = pa.ListArray.from_arrays(offsets, values)
     expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])
@@ -214,22 +222,22 @@ def test_list_from_arrays():
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
 
-    in_arr = pa.Array.from_pandas(in_data, type=in_type)
+    in_arr = pa.array(in_data, type=in_type)
 
     casted = in_arr.cast(out_type, safe=safe)
-    expected = pa.Array.from_pandas(out_data, type=out_type)
+    expected = pa.array(out_data, type=out_type)
     assert casted.equals(expected)
 
 
 def test_cast_integers_safe():
     safe_cases = [
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()),
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
-        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
+        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
          np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
     ]
 
@@ -237,13 +245,13 @@ def test_cast_integers_safe():
         _check_cast_case(case)
 
     unsafe_cases = [
-        (np.array([50000], dtype='i4'), pa.int32(), pa.int16()),
-        (np.array([70000], dtype='i4'), pa.int32(), pa.uint16()),
-        (np.array([-1], dtype='i4'), pa.int32(), pa.uint16()),
-        (np.array([50000], dtype='u2'), pa.uint16(), pa.int16())
+        (np.array([50000], dtype='i4'), 'int32', 'int16'),
+        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
+        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
+        (np.array([50000], dtype='u2'), 'uint16', 'int16')
     ]
     for in_data, in_type, out_type in unsafe_cases:
-        in_arr = pa.Array.from_pandas(in_data, type=in_type)
+        in_arr = pa.array(in_data, type=in_type)
 
         with pytest.raises(pa.ArrowInvalid):
             in_arr.cast(out_type)
@@ -252,11 +260,11 @@ def test_cast_integers_safe():
 def test_cast_integers_unsafe():
     # We let NumPy do the unsafe casting
     unsafe_cases = [
-        (np.array([50000], dtype='i4'), pa.int32(),
+        (np.array([50000], dtype='i4'), 'int32',
          np.array([50000], dtype='i2'), pa.int16()),
-        (np.array([70000], dtype='i4'), pa.int32(),
+        (np.array([70000], dtype='i4'), 'int32',
          np.array([70000], dtype='u2'), pa.uint16()),
-        (np.array([-1], dtype='i4'), pa.int32(),
+        (np.array([-1], dtype='i4'), 'int32',
          np.array([-1], dtype='u2'), pa.uint16()),
         (np.array([50000], dtype='u2'), pa.uint16(),
          np.array([50000], dtype='i2'), pa.int16())
@@ -315,3 +323,17 @@ def test_simple_type_construction():
 )
 def test_logical_type(type, expected):
     assert get_logical_type(type) == expected
+
+
+def test_array_conversions_no_sentinel_values():
+    arr = np.array([1, 2, 3, 4], dtype='int8')
+    refcount = sys.getrefcount(arr)
+    arr2 = pa.array(arr)  # noqa
+    assert sys.getrefcount(arr) == (refcount + 1)
+
+    assert arr2.type == 'int8'
+
+    arr3 = pa.array(np.array([1, np.nan, 2, 3, np.nan, 4], dtype='float32'),
+                    type='float32')
+    assert arr3.type == 'float32'
+    assert arr3.null_count == 0

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 5d56cde..182f3af 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -18,7 +18,7 @@
 
 from collections import OrderedDict
 
-from datetime import datetime, date, time
+from datetime import date, time
 import unittest
 import decimal
 import json
@@ -82,7 +82,7 @@ class TestPandasConversion(unittest.TestCase):
         tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
 
     def _check_series_roundtrip(self, s, type_=None):
-        arr = pa.Array.from_pandas(s, type=type_)
+        arr = pa.array(s, from_pandas=True, type=type_)
 
         result = pd.Series(arr.to_pandas(), name=s.name)
         if isinstance(arr.type, pa.TimestampType) and arr.type.tz is not None:
@@ -93,7 +93,7 @@ class TestPandasConversion(unittest.TestCase):
 
     def _check_array_roundtrip(self, values, expected=None, mask=None,
                                type=None):
-        arr = pa.Array.from_pandas(values, mask=mask, type=type)
+        arr = pa.array(values, from_pandas=True, mask=mask, type=type)
         result = arr.to_pandas()
 
         values_nulls = pd.isnull(values)
@@ -152,7 +152,7 @@ class TestPandasConversion(unittest.TestCase):
         for name, arrow_dtype in dtypes:
             values = np.random.randn(num_values).astype(name)
 
-            arr = pa.Array.from_pandas(values, null_mask)
+            arr = pa.array(values, from_pandas=True, mask=null_mask)
             arrays.append(arr)
             fields.append(pa.field(name, arrow_dtype))
             values[null_mask] = np.nan
@@ -223,7 +223,7 @@ class TestPandasConversion(unittest.TestCase):
         for name in int_dtypes:
             values = np.random.randint(0, 100, size=num_values)
 
-            arr = pa.Array.from_pandas(values, null_mask)
+            arr = pa.array(values, mask=null_mask)
             arrays.append(arr)
 
             expected = values.astype('f8')
@@ -244,8 +244,8 @@ class TestPandasConversion(unittest.TestCase):
 
         target_type = pa.int8()
 
-        result = pa.Array.from_pandas(arr, type=target_type)
-        expected = pa.Array.from_pandas(arr.astype('int8'))
+        result = pa.array(arr, type=target_type)
+        expected = pa.array(arr.astype('int8'))
         assert result.equals(expected)
 
     def test_boolean_no_nulls(self):
@@ -266,7 +266,7 @@ class TestPandasConversion(unittest.TestCase):
         mask = np.random.randint(0, 10, size=num_values) < 3
         values = np.random.randint(0, 10, size=num_values) < 5
 
-        arr = pa.Array.from_pandas(values, mask)
+        arr = pa.array(values, mask=mask)
 
         expected = values.astype(object)
         expected[mask] = None
@@ -292,7 +292,7 @@ class TestPandasConversion(unittest.TestCase):
         arr = np.array([None], dtype=object)
 
         def _check_type(t):
-            a2 = pa.Array.from_pandas(arr, type=t)
+            a2 = pa.array(arr, type=t)
             assert a2.type == t
             assert a2[0].as_py() is None
 
@@ -325,7 +325,7 @@ class TestPandasConversion(unittest.TestCase):
         df = pd.DataFrame({
             'strings': np.array([val] * 4000, dtype=object)
         })
-        arr = pa.Array.from_pandas(df['strings'])
+        arr = pa.array(df['strings'])
         assert isinstance(arr, pa.ChunkedArray)
         assert arr.num_chunks == 2
         arr = None
@@ -365,19 +365,6 @@ class TestPandasConversion(unittest.TestCase):
             expected_schema=schema,
         )
 
-    def test_timestamps_to_ms_explicit_schema(self):
-        # ARROW-1328
-        df = pd.DataFrame({'datetime': [datetime(2017, 1, 1)]})
-        pa_type = pa.from_numpy_dtype(df['datetime'].dtype)
-
-        with tm.assert_produces_warning(FutureWarning,
-                                        check_stacklevel=False):
-            arr = pa.Array.from_pandas(df['datetime'], type=pa_type,
-                                       timestamps_to_ms=True)
-
-        tm.assert_almost_equal(df['datetime'].values.astype('M8[ms]'),
-                               arr.to_pandas())
-
     def test_timestamps_notimezone_nulls(self):
         df = pd.DataFrame({
             'datetime64': np.array([
@@ -450,11 +437,11 @@ class TestPandasConversion(unittest.TestCase):
         t32 = pa.date32()
         t64 = pa.date64()
 
-        a32 = pa.Array.from_pandas(arr, type=t32)
-        a64 = pa.Array.from_pandas(arr, type=t64)
+        a32 = pa.array(arr, type=t32)
+        a64 = pa.array(arr, type=t64)
 
-        a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32)
-        a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64)
+        a32_expected = pa.array(arr_i4, mask=mask, type=t32)
+        a64_expected = pa.array(arr_i8, mask=mask, type=t64)
 
         assert a32.equals(a32_expected)
         assert a64.equals(a64_expected)
@@ -481,8 +468,8 @@ class TestPandasConversion(unittest.TestCase):
         arr = np.array([17259, 17260, 17261], dtype='int32')
         arr2 = arr.astype('int64') * 86400000
 
-        a1 = pa.Array.from_pandas(arr, type=t1)
-        a2 = pa.Array.from_pandas(arr2, type=t2)
+        a1 = pa.array(arr, type=t1)
+        a2 = pa.array(arr2, type=t2)
 
         expected = date(2017, 4, 3)
         assert a1[0].as_py() == expected
@@ -520,7 +507,7 @@ class TestPandasConversion(unittest.TestCase):
             np.arange(1, dtype=dtype)
         ])
         type_ = pa.list_(pa.int8())
-        parr = pa.Array.from_pandas(arr, type=type_)
+        parr = pa.array(arr, type=type_)
 
         assert parr[0].as_py() == list(range(10))
         assert parr[1].as_py() == list(range(5))
@@ -592,7 +579,7 @@ class TestPandasConversion(unittest.TestCase):
     def test_nested_lists_all_none(self):
         data = np.array([[None, None], None], dtype=object)
 
-        arr = pa.Array.from_pandas(data)
+        arr = pa.array(data)
         expected = pa.array(list(data))
         assert arr.equals(expected)
         assert arr.type == pa.list_(pa.null())
@@ -600,7 +587,7 @@ class TestPandasConversion(unittest.TestCase):
         data2 = np.array([None, None, [None, None],
                           np.array([None, None], dtype=object)],
                          dtype=object)
-        arr = pa.Array.from_pandas(data2)
+        arr = pa.array(data2)
         expected = pa.array([None, None, [None, None], [None, None]])
         assert arr.equals(expected)
 
@@ -760,7 +747,7 @@ class TestPandasConversion(unittest.TestCase):
         t1 = pa.time64('us')
 
         aobjs = np.array(pytimes + [None], dtype=object)
-        parr = pa.Array.from_pandas(aobjs)
+        parr = pa.array(aobjs)
         assert parr.type == t1
         assert parr[0].as_py() == pytimes[0]
         assert parr[1].as_py() == pytimes[1]
@@ -775,18 +762,18 @@ class TestPandasConversion(unittest.TestCase):
         arr = np.array([_pytime_to_micros(v) for v in pytimes],
                        dtype='int64')
 
-        a1 = pa.Array.from_pandas(arr, type=pa.time64('us'))
+        a1 = pa.array(arr, type=pa.time64('us'))
         assert a1[0].as_py() == pytimes[0]
 
-        a2 = pa.Array.from_pandas(arr * 1000, type=pa.time64('ns'))
+        a2 = pa.array(arr * 1000, type=pa.time64('ns'))
         assert a2[0].as_py() == pytimes[0]
 
-        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'),
-                                  type=pa.time32('ms'))
+        a3 = pa.array((arr / 1000).astype('i4'),
+                      type=pa.time32('ms'))
         assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)
 
-        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'),
-                                  type=pa.time32('s'))
+        a4 = pa.array((arr / 1000000).astype('i4'),
+                      type=pa.time32('s'))
         assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
 
     def test_arrow_time_to_pandas(self):
@@ -809,14 +796,14 @@ class TestPandasConversion(unittest.TestCase):
 
         null_mask = np.array([False, False, True], dtype=bool)
 
-        a1 = pa.Array.from_pandas(arr, mask=null_mask, type=pa.time64('us'))
-        a2 = pa.Array.from_pandas(arr * 1000, mask=null_mask,
-                                  type=pa.time64('ns'))
+        a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
+        a2 = pa.array(arr * 1000, mask=null_mask,
+                      type=pa.time64('ns'))
 
-        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), mask=null_mask,
-                                  type=pa.time32('ms'))
-        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), mask=null_mask,
-                                  type=pa.time32('s'))
+        a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
+                      type=pa.time32('ms'))
+        a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
+                      type=pa.time32('s'))
 
         names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
         batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
@@ -841,8 +828,8 @@ class TestPandasConversion(unittest.TestCase):
 
         tm.assert_frame_equal(df, expected_df)
 
-    def _check_numpy_array_roundtrip(self, np_array):
-        arr = pa.Array.from_pandas(np_array)
+    def _check_array_from_pandas_roundtrip(self, np_array):
+        arr = pa.array(np_array, from_pandas=True)
         result = arr.to_pandas()
         npt.assert_array_equal(result, np_array)
 
@@ -853,7 +840,7 @@ class TestPandasConversion(unittest.TestCase):
                 '2006-01-13T12:34:56.432539784',
                 '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ns]')
-        self._check_numpy_array_roundtrip(datetime64_ns)
+        self._check_array_from_pandas_roundtrip(datetime64_ns)
 
         datetime64_us = np.array([
                 '2007-07-13T01:23:34.123456',
@@ -861,7 +848,7 @@ class TestPandasConversion(unittest.TestCase):
                 '2006-01-13T12:34:56.432539',
                 '2010-08-13T05:46:57.437699'],
                 dtype='datetime64[us]')
-        self._check_numpy_array_roundtrip(datetime64_us)
+        self._check_array_from_pandas_roundtrip(datetime64_us)
 
         datetime64_ms = np.array([
                 '2007-07-13T01:23:34.123',
@@ -869,7 +856,7 @@ class TestPandasConversion(unittest.TestCase):
                 '2006-01-13T12:34:56.432',
                 '2010-08-13T05:46:57.437'],
                 dtype='datetime64[ms]')
-        self._check_numpy_array_roundtrip(datetime64_ms)
+        self._check_array_from_pandas_roundtrip(datetime64_ms)
 
         datetime64_s = np.array([
                 '2007-07-13T01:23:34',
@@ -877,7 +864,7 @@ class TestPandasConversion(unittest.TestCase):
                 '2006-01-13T12:34:56',
                 '2010-08-13T05:46:57'],
                 dtype='datetime64[s]')
-        self._check_numpy_array_roundtrip(datetime64_s)
+        self._check_array_from_pandas_roundtrip(datetime64_s)
 
         datetime64_d = np.array([
                 '2007-07-13',
@@ -885,11 +872,11 @@ class TestPandasConversion(unittest.TestCase):
                 '2006-01-15',
                 '2010-08-19'],
                 dtype='datetime64[D]')
-        self._check_numpy_array_roundtrip(datetime64_d)
+        self._check_array_from_pandas_roundtrip(datetime64_d)
 
     def test_all_nones(self):
         def _check_series(s):
-            converted = pa.Array.from_pandas(s)
+            converted = pa.array(s)
             assert isinstance(converted, pa.NullArray)
             assert len(converted) == 3
             assert converted.null_count == 3

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index eb23894..b0593fe 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -457,8 +457,26 @@ def test_column_of_arrays(tmpdir):
 
 @parquet
 def test_coerce_timestamps(tmpdir):
+    from collections import OrderedDict
     # ARROW-622
-    df, schema = dataframe_with_arrays()
+    arrays = OrderedDict()
+    fields = [pa.field('datetime64',
+                       pa.list_(pa.timestamp('ms')))]
+    arrays['datetime64'] = [
+        np.array(['2007-07-13T01:23:34.123456789',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+        None,
+        None,
+        np.array(['2007-07-13T02',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+    ]
+
+    df = pd.DataFrame(arrays)
+    schema = pa.schema(fields)
 
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df, schema=schema)
@@ -497,41 +515,41 @@ def test_column_of_lists(tmpdir):
 def test_date_time_types():
     t1 = pa.date32()
     data1 = np.array([17259, 17260, 17261], dtype='int32')
-    a1 = pa.Array.from_pandas(data1, type=t1)
+    a1 = pa.array(data1, type=t1)
 
     t2 = pa.date64()
     data2 = data1.astype('int64') * 86400000
-    a2 = pa.Array.from_pandas(data2, type=t2)
+    a2 = pa.array(data2, type=t2)
 
     t3 = pa.timestamp('us')
     start = pd.Timestamp('2000-01-01').value / 1000
     data3 = np.array([start, start + 1, start + 2], dtype='int64')
-    a3 = pa.Array.from_pandas(data3, type=t3)
+    a3 = pa.array(data3, type=t3)
 
     t4 = pa.time32('ms')
     data4 = np.arange(3, dtype='i4')
-    a4 = pa.Array.from_pandas(data4, type=t4)
+    a4 = pa.array(data4, type=t4)
 
     t5 = pa.time64('us')
-    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)
+    a5 = pa.array(data4.astype('int64'), type=t5)
 
     t6 = pa.time32('s')
-    a6 = pa.Array.from_pandas(data4, type=t6)
+    a6 = pa.array(data4, type=t6)
 
     ex_t6 = pa.time32('ms')
-    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)
+    ex_a6 = pa.array(data4 * 1000, type=ex_t6)
 
     t7 = pa.timestamp('ns')
     start = pd.Timestamp('2001-01-01').value
     data7 = np.array([start, start + 1000, start + 2000],
                      dtype='int64')
-    a7 = pa.Array.from_pandas(data7, type=t7)
+    a7 = pa.array(data7, type=t7)
 
     t7_us = pa.timestamp('us')
     start = pd.Timestamp('2001-01-01').value
     data7_us = np.array([start, start + 1000, start + 2000],
                         dtype='int64') // 1000
-    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)
+    a7_us = pa.array(data7_us, type=t7_us)
 
     table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                  ['date32', 'date64', 'timestamp[us]',
@@ -575,7 +593,7 @@ def test_date_time_types():
             _write_table(table, buf, version="2.0")
 
     t7 = pa.time64('ns')
-    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)
+    a7 = pa.array(data4.astype('int64'), type=t7)
 
     _assert_unsupported(a7)
 
@@ -1295,7 +1313,7 @@ def test_large_table_int32_overflow():
 
     arr = np.ones(size, dtype='uint8')
 
-    parr = pa.Array.from_pandas(arr, type=pa.uint8())
+    parr = pa.array(arr, type=pa.uint8())
 
     table = pa.Table.from_arrays([parr], names=['one'])
     f = io.BytesIO()

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/tests/test_schema.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 4bb6a5a..c77be98 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -69,6 +69,56 @@ def test_type_list():
     assert str(l2) == 'list<my_item: string>'
 
 
+def test_type_comparisons():
+    val = pa.int32()
+    assert val == pa.int32()
+    assert val == 'int32'
+
+    with pytest.raises(TypeError):
+        val == 5
+
+
+def test_type_for_alias():
+    cases = [
+        ('i1', pa.int8()),
+        ('int8', pa.int8()),
+        ('i2', pa.int16()),
+        ('int16', pa.int16()),
+        ('i4', pa.int32()),
+        ('int32', pa.int32()),
+        ('i8', pa.int64()),
+        ('int64', pa.int64()),
+        ('u1', pa.uint8()),
+        ('uint8', pa.uint8()),
+        ('u2', pa.uint16()),
+        ('uint16', pa.uint16()),
+        ('u4', pa.uint32()),
+        ('uint32', pa.uint32()),
+        ('u8', pa.uint64()),
+        ('uint64', pa.uint64()),
+        ('f4', pa.float32()),
+        ('float32', pa.float32()),
+        ('f8', pa.float64()),
+        ('float64', pa.float64()),
+        ('date32', pa.date32()),
+        ('date64', pa.date64()),
+        ('string', pa.string()),
+        ('str', pa.string()),
+        ('binary', pa.binary()),
+        ('time32[s]', pa.time32('s')),
+        ('time32[ms]', pa.time32('ms')),
+        ('time64[us]', pa.time64('us')),
+        ('time64[ns]', pa.time64('ns')),
+        ('timestamp[s]', pa.timestamp('s')),
+        ('timestamp[ms]', pa.timestamp('ms')),
+        ('timestamp[us]', pa.timestamp('us')),
+        ('timestamp[ns]', pa.timestamp('ns')),
+    ]
+
+    for val, expected in cases:
+        assert pa.type_for_alias(val) == expected
+
+
 def test_type_string():
     t = pa.string()
     assert str(t) == 'string'

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/python/pyarrow/types.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index b298e74..316e09a 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -72,11 +72,19 @@ cdef class DataType:
     def __repr__(self):
         return '{0.__class__.__name__}({0})'.format(self)
 
-    def __richcmp__(DataType self, DataType other, int op):
+    def __richcmp__(DataType self, object other, int op):
+        cdef DataType other_type
+        if not isinstance(other, DataType):
+            if not isinstance(other, six.string_types):
+                raise TypeError(other)
+            other_type = type_for_alias(other)
+        else:
+            other_type = other
+
         if op == cp.Py_EQ:
-            return self.type.Equals(deref(other.type))
+            return self.type.Equals(deref(other_type.type))
         elif op == cp.Py_NE:
-            return not self.type.Equals(deref(other.type))
+            return not self.type.Equals(deref(other_type.type))
         else:
             raise TypeError('Invalid comparison')
 
@@ -922,6 +930,64 @@ def struct(fields):
     return pyarrow_wrap_data_type(struct_type)
 
 
+cdef dict _type_aliases = {
+    'null': null,
+    'i1': int8,
+    'int8': int8,
+    'i2': int16,
+    'int16': int16,
+    'i4': int32,
+    'int32': int32,
+    'i8': int64,
+    'int64': int64,
+    'u1': uint8,
+    'uint8': uint8,
+    'u2': uint16,
+    'uint16': uint16,
+    'u4': uint32,
+    'uint32': uint32,
+    'u8': uint64,
+    'uint64': uint64,
+    'f4': float32,
+    'float32': float32,
+    'f8': float64,
+    'float64': float64,
+    'string': string,
+    'str': string,
+    'utf8': string,
+    'binary': binary,
+    'date32': date32,
+    'date64': date64,
+    'time32[s]': time32('s'),
+    'time32[ms]': time32('ms'),
+    'time64[us]': time64('us'),
+    'time64[ns]': time64('ns'),
+    'timestamp[s]': timestamp('s'),
+    'timestamp[ms]': timestamp('ms'),
+    'timestamp[us]': timestamp('us'),
+    'timestamp[ns]': timestamp('ns'),
+}
+
+
+def type_for_alias(name):
+    """
+    Return DataType given a string alias if one exists
+
+    Returns
+    -------
+    type : DataType
+    """
+    name = name.lower()
+    try:
+        alias = _type_aliases[name]
+    except KeyError:
+        raise ValueError('No type alias for {0}'.format(name))
+
+    if isinstance(alias, DataType):
+        return alias
+    return alias()
+
+
 def schema(fields):
     """
     Construct pyarrow.Schema from collection of fields


[2/2] arrow git commit: ARROW-838: [Python] Expand pyarrow.array to handle NumPy arrays not originating in pandas

Posted by we...@apache.org.
ARROW-838: [Python] Expand pyarrow.array to handle NumPy arrays not originating in pandas

This unifies the ingest path for 1D data into `pyarrow.array`. I added the argument `from_pandas` to turn null sentinel checking on or off:

```
In [8]: arr = np.random.randn(10000000)

In [9]: arr[::3] = np.nan

In [10]: arr2 = pa.array(arr)

In [11]: arr2.null_count
Out[11]: 0

In [12]: %timeit arr2 = pa.array(arr)
The slowest run took 5.43 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 68.4 µs per loop

In [13]: arr2 = pa.array(arr, from_pandas=True)

In [14]: arr2.null_count
Out[14]: 3333334

In [15]: %timeit arr2 = pa.array(arr, from_pandas=True)
1 loop, best of 3: 228 ms per loop
```

When the data is contiguous, the conversion is always zero-copy, but when `from_pandas=True` and no null mask is passed, a null bitmap is constructed and populated.
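
Passing an explicit null `mask` gives the same nulls without the sentinel scan; a hedged sketch continuing the session above (prompt numbers illustrative, not from the original session):

```
In [16]: mask = np.zeros(len(arr), dtype=bool)

In [17]: mask[::3] = True

In [18]: pa.array(arr, mask=mask).null_count
Out[18]: 3333334
```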

This also permits reading sequences into integer types smaller than int64:

```
In [17]: pa.array([1, 2, 3, 4], type='i1')
Out[17]:
<pyarrow.lib.Int8Array object at 0x7ffa1c1c65e8>
[
  1,
  2,
  3,
  4
]
```
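
Values out of range for the requested width raise instead of silently truncating; a hedged sketch (traceback elided; the error text comes from the new bounds-checked converters):

```
In [19]: pa.array([1000], type='i1')
...
ArrowInvalid: Cannot coerce values to array type that would lose data
```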

Oh, I also added NumPy-like string type aliases:

```
In [18]: pa.int32() == 'i4'
Out[18]: True
```
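
The aliases resolve through the new `type_for_alias` helper and are also accepted by `Array.cast`; a hedged sketch (reprs follow the new `DataType.__repr__`; illustrative):

```
In [20]: pa.type_for_alias('timestamp[ms]')
Out[20]: TimestampType(timestamp[ms])

In [21]: pa.array([1, 2, 3], type='i4').cast('i8').type
Out[21]: DataType(int64)
```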

Author: Wes McKinney <we...@twosigma.com>

Closes #1146 from wesm/expand-py-array-method and squashes the following commits:

1570e525 [Wes McKinney] Code review comments
d3bbb3c3 [Wes McKinney] Handle type aliases in cast, too
797f0151 [Wes McKinney] Allow null checking to be skipped with from_pandas=False in pyarrow.array
f2802fc7 [Wes McKinney] Cleaner codepath for numpy->arrow conversions
587c575a [Wes McKinney] Add direct types sequence converters for more data types
cf40b767 [Wes McKinney] Add type aliases, some unit tests
7b530e4b [Wes McKinney] Consolidate both sequence and ndarray/Series/Index conversion in pyarrow.Array


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/796129b4
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/796129b4
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/796129b4

Branch: refs/heads/master
Commit: 796129b4f0f714fdb3c4fbf5bc2d2deb55424a84
Parents: 7c61611
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Sep 29 23:02:58 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Sep 29 23:02:58 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/python/CMakeLists.txt         |    4 +-
 cpp/src/arrow/python/api.h                  |    2 +-
 cpp/src/arrow/python/builtin_convert.cc     |  223 ++--
 cpp/src/arrow/python/numpy_to_arrow.cc      | 1228 ++++++++++++++++++++++
 cpp/src/arrow/python/numpy_to_arrow.h       |   56 +
 cpp/src/arrow/python/pandas_to_arrow.cc     | 1215 ---------------------
 cpp/src/arrow/python/pandas_to_arrow.h      |   59 --
 python/pyarrow/__init__.py                  |    2 +-
 python/pyarrow/array.pxi                    |  279 ++---
 python/pyarrow/includes/libarrow.pxd        |   11 +-
 python/pyarrow/pandas_compat.py             |   22 +-
 python/pyarrow/scalar.pxi                   |    8 +-
 python/pyarrow/table.pxi                    |   13 +-
 python/pyarrow/tests/test_array.py          |   58 +-
 python/pyarrow/tests/test_convert_pandas.py |   95 +-
 python/pyarrow/tests/test_parquet.py        |   42 +-
 python/pyarrow/tests/test_schema.py         |   50 +
 python/pyarrow/types.pxi                    |   72 +-
 18 files changed, 1841 insertions(+), 1598 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index 84aad82..7938d84 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -57,7 +57,7 @@ set(ARROW_PYTHON_SRCS
   init.cc
   io.cc
   numpy_convert.cc
-  pandas_to_arrow.cc
+  numpy_to_arrow.cc
   python_to_arrow.cc
   pyarrow.cc
 )
@@ -100,7 +100,7 @@ install(FILES
   io.h
   numpy_convert.h
   numpy_interop.h
-  pandas_to_arrow.h
+  numpy_to_arrow.h
   python_to_arrow.h
   platform.h
   pyarrow.h

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/api.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h
index 4ceb3f1..a000ac5 100644
--- a/cpp/src/arrow/python/api.h
+++ b/cpp/src/arrow/python/api.h
@@ -25,7 +25,7 @@
 #include "arrow/python/helpers.h"
 #include "arrow/python/io.h"
 #include "arrow/python/numpy_convert.h"
-#include "arrow/python/pandas_to_arrow.h"
+#include "arrow/python/numpy_to_arrow.h"
 #include "arrow/python/python_to_arrow.h"
 
 #endif  // ARROW_PYTHON_API_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/builtin_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 747b872..f9d7361 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -20,6 +20,7 @@
 #include <datetime.h>
 
 #include <algorithm>
+#include <limits>
 #include <sstream>
 #include <string>
 
@@ -359,7 +360,11 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
     if (PySequence_Check(obj)) {
       for (int64_t i = 0; i < size; ++i) {
         OwnedRef ref(PySequence_GetItem(obj, i));
-        RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        if (ref.obj() == Py_None) {
+          RETURN_NOT_OK(this->typed_builder_->AppendNull());
+        } else {
+          RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        }
       }
     } else if (PyObject_HasAttrString(obj, "__iter__")) {
       PyObject* iter = PyObject_GetIter(obj);
@@ -370,7 +375,11 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
       // consuming at size.
       while ((item = PyIter_Next(iter)) && i < size) {
         OwnedRef ref(item);
-        RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        if (ref.obj() == Py_None) {
+          RETURN_NOT_OK(this->typed_builder_->AppendNull());
+        } else {
+          RETURN_NOT_OK(static_cast<Derived*>(this)->AppendItem(ref));
+        }
         ++i;
       }
       if (size != i) {
@@ -388,52 +397,136 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> {
 class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      return Status::Invalid("NullConverter: passed non-None value");
-    }
+    return Status::Invalid("NullConverter: passed non-None value");
   }
 };
 
 class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      if (item.obj() == Py_True) {
-        return typed_builder_->Append(true);
-      } else {
-        return typed_builder_->Append(false);
-      }
+    return typed_builder_->Append(item.obj() == Py_True);
+  }
+};
+
+class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int8_t>::max() ||
+                            val < std::numeric_limits<int8_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
     }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<int8_t>(val));
+  }
+};
+
+class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int16_t>::max() ||
+                            val < std::numeric_limits<int16_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
+    }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<int16_t>(val));
+  }
+};
+
+class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int32_t>::max() ||
+                            val < std::numeric_limits<int32_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
+    }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<int32_t>(val));
   }
 };
 
 class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    int64_t val;
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
-      RETURN_IF_PYERROR();
-      return typed_builder_->Append(val);
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(val);
+  }
+};
+
+class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint8_t>::max() ||
+                            val < std::numeric_limits<uint8_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
     }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<uint8_t>(val));
   }
 };
 
-class DateConverter : public TypedConverterVisitor<Date64Builder, DateConverter> {
+class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Converter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
-      return typed_builder_->Append(PyDate_to_ms(pydate));
+    uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max() ||
+                            val < std::numeric_limits<uint16_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
     }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<uint16_t>(val));
+  }
+};
+
+class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj()));
+
+    if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint32_t>::max() ||
+                            val < std::numeric_limits<uint32_t>::min())) {
+      return Status::Invalid(
+          "Cannot coerce values to array type that would "
+          "lose data");
+    }
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(static_cast<uint32_t>(val));
+  }
+};
+
+class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Converter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj()));
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(val);
+  }
+};
+
+class DateConverter : public TypedConverterVisitor<Date64Builder, DateConverter> {
+ public:
+  inline Status AppendItem(const OwnedRef& item) {
+    auto pydate = reinterpret_cast<PyDateTime_Date*>(item.obj());
+    return typed_builder_->Append(PyDate_to_ms(pydate));
   }
 };
 
@@ -441,27 +534,17 @@ class TimestampConverter
     : public TypedConverterVisitor<Date64Builder, TimestampConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      PyDateTime_DateTime* pydatetime =
-          reinterpret_cast<PyDateTime_DateTime*>(item.obj());
-      return typed_builder_->Append(PyDateTime_to_us(pydatetime));
-    }
+    auto pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
+    return typed_builder_->Append(PyDateTime_to_us(pydatetime));
   }
 };
 
 class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConverter> {
  public:
   inline Status AppendItem(const OwnedRef& item) {
-    double val;
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      val = PyFloat_AsDouble(item.obj());
-      RETURN_IF_PYERROR();
-      return typed_builder_->Append(val);
-    }
+    double val = PyFloat_AsDouble(item.obj());
+    RETURN_IF_PYERROR();
+    return typed_builder_->Append(val);
   }
 };
 
@@ -473,10 +556,7 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverte
     Py_ssize_t length;
     OwnedRef tmp;
 
-    if (item.obj() == Py_None) {
-      RETURN_NOT_OK(typed_builder_->AppendNull());
-      return Status::OK();
-    } else if (PyUnicode_Check(item.obj())) {
+    if (PyUnicode_Check(item.obj())) {
       tmp.reset(PyUnicode_AsUTF8String(item.obj()));
       RETURN_IF_PYERROR();
       bytes_obj = tmp.obj();
@@ -504,10 +584,7 @@ class FixedWidthBytesConverter
     Py_ssize_t expected_length =
         std::dynamic_pointer_cast<FixedSizeBinaryType>(typed_builder_->type())
             ->byte_width();
-    if (item.obj() == Py_None) {
-      RETURN_NOT_OK(typed_builder_->AppendNull());
-      return Status::OK();
-    } else if (PyUnicode_Check(item.obj())) {
+    if (PyUnicode_Check(item.obj())) {
       tmp.reset(PyUnicode_AsUTF8String(item.obj()));
       RETURN_IF_PYERROR();
       bytes_obj = tmp.obj();
@@ -535,9 +612,7 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter>
     Py_ssize_t length;
 
     PyObject* obj = item.obj();
-    if (obj == Py_None) {
-      return typed_builder_->AppendNull();
-    } else if (PyBytes_Check(obj)) {
+    if (PyBytes_Check(obj)) {
       tmp.reset(
           PyUnicode_FromStringAndSize(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj)));
       RETURN_IF_PYERROR();
@@ -565,14 +640,10 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> {
   Status Init(ArrayBuilder* builder) override;
 
   inline Status AppendItem(const OwnedRef& item) override {
-    if (item.obj() == Py_None) {
-      return typed_builder_->AppendNull();
-    } else {
-      RETURN_NOT_OK(typed_builder_->Append());
-      PyObject* item_obj = item.obj();
-      int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj));
-      return value_converter_->AppendData(item_obj, list_size);
-    }
+    RETURN_NOT_OK(typed_builder_->Append());
+    PyObject* item_obj = item.obj();
+    int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj));
+    return value_converter_->AppendData(item_obj, list_size);
   }
 
  protected:
@@ -584,16 +655,12 @@ class DecimalConverter
  public:
   inline Status AppendItem(const OwnedRef& item) {
     /// TODO(phillipc): Check for nan?
-    if (item.obj() != Py_None) {
-      std::string string;
-      RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string));
-
-      Decimal128 value;
-      RETURN_NOT_OK(Decimal128::FromString(string, &value));
-      return typed_builder_->Append(value);
-    }
+    std::string string;
+    RETURN_NOT_OK(PythonDecimalToString(item.obj(), &string));
 
-    return typed_builder_->AppendNull();
+    Decimal128 value;
+    RETURN_NOT_OK(Decimal128::FromString(string, &value));
+    return typed_builder_->Append(value);
   }
 };
 
@@ -604,8 +671,22 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
       return std::make_shared<NullConverter>();
     case Type::BOOL:
       return std::make_shared<BoolConverter>();
+    case Type::INT8:
+      return std::make_shared<Int8Converter>();
+    case Type::INT16:
+      return std::make_shared<Int16Converter>();
+    case Type::INT32:
+      return std::make_shared<Int32Converter>();
     case Type::INT64:
       return std::make_shared<Int64Converter>();
+    case Type::UINT8:
+      return std::make_shared<UInt8Converter>();
+    case Type::UINT16:
+      return std::make_shared<UInt16Converter>();
+    case Type::UINT32:
+      return std::make_shared<UInt32Converter>();
+    case Type::UINT64:
+      return std::make_shared<UInt64Converter>();
     case Type::DATE64:
       return std::make_shared<DateConverter>();
     case Type::TIMESTAMP:

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/numpy_to_arrow.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
new file mode 100644
index 0000000..7151c94
--- /dev/null
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -0,0 +1,1228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for pandas conversion via NumPy
+
+#define ARROW_NO_DEFAULT_MEMORY_POOL
+
+#include "arrow/python/numpy_to_arrow.h"
+#include "arrow/python/numpy_interop.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/visitor_inline.h"
+
+#include "arrow/compute/cast.h"
+#include "arrow/compute/context.h"
+
+#include "arrow/python/builtin_convert.h"
+#include "arrow/python/common.h"
+#include "arrow/python/config.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/numpy-internal.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/type_traits.h"
+#include "arrow/python/util/datetime.h"
+
+namespace arrow {
+namespace py {
+
+using internal::NumPyTypeSize;
+
+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
+
+// ----------------------------------------------------------------------
+// Conversion utilities
+
+namespace {
+
+inline bool PyFloat_isnan(const PyObject* obj) {
+  if (PyFloat_Check(obj)) {
+    double val = PyFloat_AS_DOUBLE(obj);
+    return val != val;
+  } else {
+    return false;
+  }
+}
+
+inline bool PandasObjectIsNull(const PyObject* obj) {
+  return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj);
+}
+
+inline bool PyObject_is_string(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_Check(obj) || PyBytes_Check(obj);
+#else
+  return PyString_Check(obj) || PyUnicode_Check(obj);
+#endif
+}
+
+inline bool PyObject_is_float(const PyObject* obj) { return PyFloat_Check(obj); }
+
+inline bool PyObject_is_integer(const PyObject* obj) {
+  return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj);
+}
+
+template <int TYPE>
+inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
+  typedef internal::npy_traits<TYPE> traits;
+  typedef typename traits::value_type T;
+
+  int64_t null_count = 0;
+
+  Ndarray1DIndexer<T> values(arr);
+  for (int i = 0; i < values.size(); ++i) {
+    if (traits::isnull(values[i])) {
+      ++null_count;
+    } else {
+      BitUtil::SetBit(bitmap, i);
+    }
+  }
+
+  return null_count;
+}
+
+// Returns null count
+int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
+  int64_t null_count = 0;
+
+  Ndarray1DIndexer<uint8_t> mask_values(mask);
+  for (int i = 0; i < length; ++i) {
+    if (mask_values[i]) {
+      ++null_count;
+    } else {
+      BitUtil::SetBit(bitmap, i);
+    }
+  }
+  return null_count;
+}
+
+Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
+  if (PyArray_NDIM(numpy_array) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  const int received_type = PyArray_DESCR(numpy_array)->type_num;
+  if (received_type != np_type) {
+    std::stringstream ss;
+    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
+       << GetNumPyTypeName(received_type);
+    return Status::Invalid(ss.str());
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+/// Append as many string objects from NumPy arrays to a `StringBuilder` as we
+/// can fit
+///
+/// \param[in] offset starting offset for appending
+/// \param[out] end_offset ending offset where we stopped appending. Will
+/// be length of arr if fully consumed
+/// \param[out] have_bytes true if we encountered any PyBytes object
+static Status AppendObjectStrings(PyArrayObject* arr, PyArrayObject* mask, int64_t offset,
+                                  StringBuilder* builder, int64_t* end_offset,
+                                  bool* have_bytes) {
+  PyObject* obj;
+
+  Ndarray1DIndexer<PyObject*> objects(arr);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask != nullptr) {
+    mask_values.Init(mask);
+    have_mask = true;
+  }
+
+  for (; offset < objects.size(); ++offset) {
+    OwnedRef tmp_obj;
+    obj = objects[offset];
+    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder->AppendNull());
+      continue;
+    } else if (PyUnicode_Check(obj)) {
+      obj = PyUnicode_AsUTF8String(obj);
+      if (obj == NULL) {
+        PyErr_Clear();
+        return Status::Invalid("failed converting unicode to UTF8");
+      }
+      tmp_obj.reset(obj);
+    } else if (PyBytes_Check(obj)) {
+      *have_bytes = true;
+    } else {
+      std::stringstream ss;
+      ss << "Error converting to Python objects to String/UTF8: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
+      return Status::Invalid(ss.str());
+    }
+
+    const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
+    if (ARROW_PREDICT_FALSE(builder->value_data_length() + length > kBinaryMemoryLimit)) {
+      break;
+    }
+    RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
+  }
+
+  // If we consumed the whole array, this will be the length of arr
+  *end_offset = offset;
+  return Status::OK();
+}
+
+static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mask,
+                                          int byte_width, int64_t offset,
+                                          FixedSizeBinaryBuilder* builder,
+                                          int64_t* end_offset) {
+  PyObject* obj;
+
+  Ndarray1DIndexer<PyObject*> objects(arr);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask != nullptr) {
+    mask_values.Init(mask);
+    have_mask = true;
+  }
+
+  for (; offset < objects.size(); ++offset) {
+    OwnedRef tmp_obj;
+    obj = objects[offset];
+    if ((have_mask && mask_values[offset]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder->AppendNull());
+      continue;
+    } else if (PyUnicode_Check(obj)) {
+      obj = PyUnicode_AsUTF8String(obj);
+      if (obj == NULL) {
+        PyErr_Clear();
+        return Status::Invalid("failed converting unicode to UTF8");
+      }
+
+      tmp_obj.reset(obj);
+    } else if (!PyBytes_Check(obj)) {
+      std::stringstream ss;
+      ss << "Error converting to Python objects to FixedSizeBinary: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "str, bytes", &ss));
+      return Status::Invalid(ss.str());
+    }
+
+    RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
+    if (ARROW_PREDICT_FALSE(builder->value_data_length() + byte_width >
+                            kBinaryMemoryLimit)) {
+      break;
+    }
+    RETURN_NOT_OK(
+        builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
+  }
+
+  // If we consumed the whole array, this will be the length of arr
+  *end_offset = offset;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Conversion from NumPy-in-Pandas to Arrow
+
+class NumPyConverter {
+ public:
+  NumPyConverter(MemoryPool* pool, PyObject* ao, PyObject* mo,
+                 const std::shared_ptr<DataType>& type, bool use_pandas_null_sentinels)
+      : pool_(pool),
+        type_(type),
+        arr_(reinterpret_cast<PyArrayObject*>(ao)),
+        mask_(nullptr),
+        use_pandas_null_sentinels_(use_pandas_null_sentinels) {
+    if (mo != nullptr && mo != Py_None) {
+      mask_ = reinterpret_cast<PyArrayObject*>(mo);
+    }
+    length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
+  }
+
+  bool is_strided() const {
+    npy_intp* astrides = PyArray_STRIDES(arr_);
+    return astrides[0] != PyArray_DESCR(arr_)->elsize;
+  }
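+
+  // For example (illustrative, not part of this change): a NumPy slice such
+  // as np.arange(10, dtype='int64')[::2] reports a first stride of 16 bytes
+  // against an elsize of 8, so is_strided() returns true and the values must
+  // be copied into contiguous memory before conversion.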
+
+  Status Convert();
+
+  const std::vector<std::shared_ptr<Array>>& result() const { return out_arrays_; }
+
+  template <typename T>
+  typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
+                              std::is_same<BooleanType, T>::value,
+                          Status>::type
+  Visit(const T& type) {
+    return VisitNative<T>();
+  }
+
+  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
+  Status Visit(const Date64Type& type) { return VisitNative<Int64Type>(); }
+  Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
+  Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
+  Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
+
+  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const FixedSizeBinaryType& type) {
+    return TypeNotImplemented(type.ToString());
+  }
+
+  Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); }
+
+ protected:
+  Status InitNullBitmap() {
+    int64_t null_bytes = BitUtil::BytesForBits(length_);
+
+    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
+
+    null_bitmap_data_ = null_bitmap_->mutable_data();
+    memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
+
+    return Status::OK();
+  }
+
+  // ----------------------------------------------------------------------
+  // Traditional visitor conversion for non-object arrays
+
+  template <typename ArrowType>
+  Status ConvertData(std::shared_ptr<Buffer>* data);
+
+  template <typename T>
+  Status PushBuilderResult(T* builder) {
+    std::shared_ptr<Array> out;
+    RETURN_NOT_OK(builder->Finish(&out));
+    out_arrays_.emplace_back(out);
+    return Status::OK();
+  }
+
+  template <int TYPE, typename BuilderType>
+  Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
+    typedef internal::npy_traits<TYPE> traits;
+    typedef typename traits::value_type T;
+
+    const bool null_sentinels_possible =
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    // TODO(wesm): Vector append when not strided
+    Ndarray1DIndexer<T> values(array);
+    if (null_sentinels_possible) {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        if (traits::isnull(values[i])) {
+          RETURN_NOT_OK(builder->AppendNull());
+        } else {
+          RETURN_NOT_OK(builder->Append(values[i]));
+        }
+      }
+    } else {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        RETURN_NOT_OK(builder->Append(values[i]));
+      }
+    }
+    return Status::OK();
+  }
+
+  Status PushArray(const std::shared_ptr<ArrayData>& data) {
+    std::shared_ptr<Array> result;
+    RETURN_NOT_OK(MakeArray(data, &result));
+    out_arrays_.emplace_back(std::move(result));
+    return Status::OK();
+  }
+
+  template <typename ArrowType>
+  Status VisitNative() {
+    using traits = internal::arrow_traits<ArrowType::type_id>;
+
+    const bool null_sentinels_possible =
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    if (mask_ != nullptr || null_sentinels_possible) {
+      RETURN_NOT_OK(InitNullBitmap());
+    }
+
+    std::shared_ptr<Buffer> data;
+    RETURN_NOT_OK(ConvertData<ArrowType>(&data));
+
+    int64_t null_count = 0;
+    if (mask_ != nullptr) {
+      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
+    } else if (null_sentinels_possible) {
+      // TODO(wesm): this presumes the NumPy C type and arrow C type are the
+      // same
+      null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
+    }
+
+    BufferVector buffers = {null_bitmap_, data};
+    auto arr_data =
+        std::make_shared<ArrayData>(type_, length_, std::move(buffers), null_count, 0);
+    return PushArray(arr_data);
+  }
+
+  Status TypeNotImplemented(std::string type_name) {
+    std::stringstream ss;
+    ss << "NumPyConverter doesn't implement <" << type_name << "> conversion";
+    return Status::NotImplemented(ss.str());
+  }
+
+  // ----------------------------------------------------------------------
+  // Conversion logic for various object dtype arrays
+
+  Status ConvertObjects();
+
+  template <int ITEM_TYPE, typename ArrowType>
+  Status ConvertTypedLists(const std::shared_ptr<DataType>& type, ListBuilder* builder,
+                           PyObject* list);
+
+  template <typename ArrowType>
+  Status ConvertDates();
+
+  Status ConvertBooleans();
+  Status ConvertObjectStrings();
+  Status ConvertObjectFloats();
+  Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
+  Status ConvertObjectIntegers();
+  Status ConvertLists(const std::shared_ptr<DataType>& type);
+  Status ConvertLists(const std::shared_ptr<DataType>& type, ListBuilder* builder,
+                      PyObject* list);
+  Status ConvertDecimals();
+  Status ConvertTimes();
+  Status ConvertObjectsInfer();
+  Status ConvertObjectsInferAndCast();
+
+  MemoryPool* pool_;
+  std::shared_ptr<DataType> type_;
+  PyArrayObject* arr_;
+  PyArrayObject* mask_;
+  int64_t length_;
+
+  bool use_pandas_null_sentinels_;
+
+  // Used in visitor pattern
+  std::vector<std::shared_ptr<Array>> out_arrays_;
+
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+};
+
+Status NumPyConverter::Convert() {
+  if (PyArray_NDIM(arr_) != 1) {
+    return Status::Invalid("can only handle 1-dimensional arrays");
+  }
+
+  if (PyArray_DESCR(arr_)->type_num == NPY_OBJECT) {
+    return ConvertObjects();
+  }
+
+  if (type_ == nullptr) {
+    return Status::Invalid("Must pass data type for non-object arrays");
+  }
+
+  // Visit the type to perform conversion
+  return VisitTypeInline(*type_, this);
+}
+
+template <typename T, typename T2>
+void CopyStrided(T* input_data, int64_t length, int64_t stride, T2* output_data) {
+  // Passing input_data as non-const is a concession to PyObject*
+  int64_t j = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    output_data[i] = static_cast<T2>(input_data[j]);
+    j += stride;
+  }
+}
+
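+// The PyObject* specialization below additionally increments each copied
+// object's refcount, since the destination array takes its own owning
+// reference to every element.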
+template <>
+void CopyStrided<PyObject*, PyObject*>(PyObject** input_data, int64_t length,
+                                       int64_t stride, PyObject** output_data) {
+  int64_t j = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    output_data[i] = input_data[j];
+    if (output_data[i] != nullptr) {
+      Py_INCREF(output_data[i]);
+    }
+    j += stride;
+  }
+}
+
+static Status CastBuffer(const std::shared_ptr<Buffer>& input, const int64_t length,
+                         const std::shared_ptr<DataType>& in_type,
+                         const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+                         std::shared_ptr<Buffer>* out) {
+  // Must cast
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, input};
+  auto tmp_data = std::make_shared<ArrayData>(in_type, length, buffers, 0);
+
+  std::shared_ptr<Array> tmp_array, casted_array;
+  RETURN_NOT_OK(MakeArray(tmp_data, &tmp_array));
+
+  compute::FunctionContext context(pool);
+  compute::CastOptions cast_options;
+  cast_options.allow_int_overflow = false;
+
+  RETURN_NOT_OK(
+      compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
+  *out = casted_array->data()->buffers[1];
+  return Status::OK();
+}
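+
+// A hedged usage sketch for CastBuffer (illustrative; `buf` is assumed to
+// hold `length` int32 values):
+//
+//   std::shared_ptr<Buffer> out;
+//   RETURN_NOT_OK(CastBuffer(buf, length, ::arrow::int32(), ::arrow::int64(),
+//                            default_memory_pool(), &out));
+//
+// Internally this wraps the raw buffer in a temporary ArrayData of the input
+// type and runs the compute cast kernel, with integer overflow disallowed.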
+
+template <typename ArrowType>
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+  using traits = internal::arrow_traits<ArrowType::type_id>;
+  using T = typename traits::T;
+
+  if (is_strided()) {
+    // Strided, must copy into new contiguous memory
+    const int64_t stride = PyArray_STRIDES(arr_)[0];
+    const int64_t stride_elements = stride / sizeof(T);
+
+    auto new_buffer = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(new_buffer->Resize(sizeof(T) * length_));
+    CopyStrided(reinterpret_cast<T*>(PyArray_DATA(arr_)), length_, stride_elements,
+                reinterpret_cast<T*>(new_buffer->mutable_data()));
+    *data = new_buffer;
+  } else {
+    // Can zero-copy
+    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+  }
+
+  std::shared_ptr<DataType> input_type;
+  RETURN_NOT_OK(
+      NumPyDtypeToArrow(reinterpret_cast<PyObject*>(PyArray_DESCR(arr_)), &input_type));
+
+  if (!input_type->Equals(*type_)) {
+    RETURN_NOT_OK(CastBuffer(*data, length_, input_type, type_, pool_, data));
+  }
+
+  return Status::OK();
+}
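+
+// Note on the path above: contiguous input is zero-copied by wrapping the
+// ndarray in a NumPyBuffer (keeping a Python reference alive), strided input
+// pays a one-time copy, and a cast is only performed when the NumPy dtype
+// disagrees with the requested Arrow type.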
+
+template <>
+inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
+  // Handle LONGLONG->INT64 and other fun things
+  int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
+  int type_size = NumPyTypeSize(type_num_compat);
+
+  if (type_size == 4) {
+    // Source and target are INT32, so we can defer to the main implementation.
+    return ConvertData<Int32Type>(data);
+  } else if (type_size == 8) {
+    // We need to scale down from int64 to int32
+    auto new_buffer = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(new_buffer->Resize(sizeof(int32_t) * length_));
+
+    auto input = reinterpret_cast<const int64_t*>(PyArray_DATA(arr_));
+    auto output = reinterpret_cast<int32_t*>(new_buffer->mutable_data());
+
+    if (is_strided()) {
+      // Strided, must copy into new contiguous memory
+      const int64_t stride = PyArray_STRIDES(arr_)[0];
+      const int64_t stride_elements = stride / sizeof(int64_t);
+      CopyStrided(input, length_, stride_elements, output);
+    } else {
+      // TODO(wesm): int32 overflow checks
+      for (int64_t i = 0; i < length_; ++i) {
+        *output++ = static_cast<int32_t>(*input++);
+      }
+    }
+    *data = new_buffer;
+  } else {
+    std::stringstream ss;
+    ss << "Cannot convert NumPy array of element size ";
+    ss << type_size << " to a Date32 array";
+    return Status::NotImplemented(ss.str());
+  }
+
+  return Status::OK();
+}
+
+template <>
+inline Status NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
+  int64_t nbytes = BitUtil::BytesForBits(length_);
+  auto buffer = std::make_shared<PoolBuffer>(pool_);
+  RETURN_NOT_OK(buffer->Resize(nbytes));
+
+  Ndarray1DIndexer<uint8_t> values(arr_);
+
+  uint8_t* bitmap = buffer->mutable_data();
+
+  memset(bitmap, 0, nbytes);
+  for (int64_t i = 0; i < length_; ++i) {
+    if (values[i] > 0) {
+      BitUtil::SetBit(bitmap, i);
+    }
+  }
+
+  *data = buffer;
+  return Status::OK();
+}
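+
+// Worked example (illustrative): for values {1, 0, 1, 1}, the loop sets bits
+// 0, 2, and 3, yielding the single byte 0b00001101 in Arrow's LSB-first
+// bitmap layout; BytesForBits(4) = 1, so only one byte is allocated.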
+
+template <typename T>
+struct UnboxDate {};
+
+template <>
+struct UnboxDate<Date32Type> {
+  static int32_t Unbox(PyObject* obj) {
+    return PyDate_to_days(reinterpret_cast<PyDateTime_Date*>(obj));
+  }
+};
+
+template <>
+struct UnboxDate<Date64Type> {
+  static int64_t Unbox(PyObject* obj) {
+    return PyDate_to_ms(reinterpret_cast<PyDateTime_Date*>(obj));
+  }
+};
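+
+// For example: datetime.date(1970, 1, 2) unboxes to 1 under Date32Type (days
+// since the UNIX epoch) and to 86400000 under Date64Type (milliseconds since
+// the epoch).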
+
+template <typename ArrowType>
+Status NumPyConverter::ConvertDates() {
+  PyAcquireGIL lock;
+
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  BuilderType builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  // The datetime C API must be imported in every compilation unit that uses
+  // it, so we do so here before unboxing any date objects
+  PyDateTime_IMPORT;
+
+  PyObject* obj;
+  for (int64_t i = 0; i < length_; ++i) {
+    obj = objects[i];
+    if (PyDate_CheckExact(obj)) {
+      RETURN_NOT_OK(builder.Append(UnboxDate<ArrowType>::Unbox(obj)));
+    } else if (PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      std::stringstream ss;
+      ss << "Error converting from Python objects to Date: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "datetime.date", &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+
+  return PushBuilderResult(&builder);
+}
+
+Status NumPyConverter::ConvertDecimals() {
+  PyAcquireGIL lock;
+
+  // Import the decimal module and Decimal class
+  OwnedRef decimal;
+  OwnedRef Decimal;
+  RETURN_NOT_OK(ImportModule("decimal", &decimal));
+  RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  PyObject* object = objects[0];
+
+  int precision;
+  int scale;
+
+  RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
+
+  type_ = std::make_shared<DecimalType>(precision, scale);
+
+  DecimalBuilder builder(type_, pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  for (int64_t i = 0; i < length_; ++i) {
+    object = objects[i];
+    if (PyObject_IsInstance(object, Decimal.obj())) {
+      std::string string;
+      RETURN_NOT_OK(PythonDecimalToString(object, &string));
+
+      Decimal128 value;
+      RETURN_NOT_OK(Decimal128::FromString(string, &value));
+      RETURN_NOT_OK(builder.Append(value));
+    } else if (PandasObjectIsNull(object)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      std::stringstream ss;
+      ss << "Error converting from Python objects to Decimal: ";
+      RETURN_NOT_OK(InvalidConversion(object, "decimal.Decimal", &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+  return PushBuilderResult(&builder);
+}
+
+Status NumPyConverter::ConvertTimes() {
+  // Convert array of datetime.time objects to Arrow
+  PyAcquireGIL lock;
+  PyDateTime_IMPORT;
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+
+  // datetime.time stores microsecond resolution
+  Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  PyObject* obj;
+  for (int64_t i = 0; i < length_; ++i) {
+    obj = objects[i];
+    if (PyTime_Check(obj)) {
+      RETURN_NOT_OK(builder.Append(PyTime_to_us(obj)));
+    } else if (PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      std::stringstream ss;
+      ss << "Error converting from Python objects to Time: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "datetime.time", &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+  return PushBuilderResult(&builder);
+}
+
+Status NumPyConverter::ConvertObjectStrings() {
+  PyAcquireGIL lock;
+
+  // The output type at this point is inconclusive because there may be bytes
+  // and unicode mixed in the object array
+  StringBuilder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  bool global_have_bytes = false;
+  int64_t offset = 0;
+  while (offset < length_) {
+    bool chunk_have_bytes = false;
+    RETURN_NOT_OK(
+        AppendObjectStrings(arr_, mask_, offset, &builder, &offset, &chunk_have_bytes));
+
+    global_have_bytes |= chunk_have_bytes;
+    std::shared_ptr<Array> chunk;
+    RETURN_NOT_OK(builder.Finish(&chunk));
+    out_arrays_.emplace_back(std::move(chunk));
+  }
+
+  // If we saw PyBytes, convert everything to BinaryArray
+  if (global_have_bytes) {
+    for (size_t i = 0; i < out_arrays_.size(); ++i) {
+      auto binary_data = out_arrays_[i]->data()->ShallowCopy();
+      binary_data->type = ::arrow::binary();
+      out_arrays_[i] = std::make_shared<BinaryArray>(binary_data);
+    }
+  }
+  return Status::OK();
+}
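+
+// For example (illustrative): an object array such as {b"ab", u"cd"} first
+// yields StringArray chunks; because a bytes value was seen, each chunk is
+// then reinterpreted as a BinaryArray through a shallow copy of its
+// ArrayData, without copying the character data.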
+
+Status NumPyConverter::ConvertObjectFloats() {
+  PyAcquireGIL lock;
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask_ != nullptr) {
+    mask_values.Init(mask_);
+    have_mask = true;
+  }
+
+  DoubleBuilder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  PyObject* obj;
+  for (int64_t i = 0; i < objects.size(); ++i) {
+    obj = objects[i];
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyFloat_Check(obj)) {
+      double val = PyFloat_AsDouble(obj);
+      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(builder.Append(val));
+    } else {
+      std::stringstream ss;
+      ss << "Error converting from Python objects to Double: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "float", &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+
+  return PushBuilderResult(&builder);
+}
+
+Status NumPyConverter::ConvertObjectIntegers() {
+  PyAcquireGIL lock;
+
+  Int64Builder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask_ != nullptr) {
+    mask_values.Init(mask_);
+    have_mask = true;
+  }
+
+  PyObject* obj;
+  for (int64_t i = 0; i < objects.size(); ++i) {
+    obj = objects[i];
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyObject_is_integer(obj)) {
+      const int64_t val = static_cast<int64_t>(PyLong_AsLong(obj));
+      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(builder.Append(val));
+    } else {
+      std::stringstream ss;
+      ss << "Error converting from Python objects to Int64: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "integer", &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+
+  return PushBuilderResult(&builder);
+}
+
+Status NumPyConverter::ConvertObjectFixedWidthBytes(
+    const std::shared_ptr<DataType>& type) {
+  PyAcquireGIL lock;
+
+  int32_t byte_width = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
+
+  // The output is built in chunks so that no single chunk's value data
+  // exceeds the 2GB binary buffer limit
+  FixedSizeBinaryBuilder builder(type, pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  int64_t offset = 0;
+  while (offset < length_) {
+    RETURN_NOT_OK(
+        AppendObjectFixedWidthBytes(arr_, mask_, byte_width, offset, &builder, &offset));
+
+    std::shared_ptr<Array> chunk;
+    RETURN_NOT_OK(builder.Finish(&chunk));
+    out_arrays_.emplace_back(std::move(chunk));
+  }
+  return Status::OK();
+}
+
+Status NumPyConverter::ConvertBooleans() {
+  PyAcquireGIL lock;
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+
+  bool have_mask = false;
+  if (mask_ != nullptr) {
+    mask_values.Init(mask_);
+    have_mask = true;
+  }
+
+  int64_t nbytes = BitUtil::BytesForBits(length_);
+  auto data = std::make_shared<PoolBuffer>(pool_);
+  RETURN_NOT_OK(data->Resize(nbytes));
+  uint8_t* bitmap = data->mutable_data();
+  memset(bitmap, 0, nbytes);
+
+  int64_t null_count = 0;
+  PyObject* obj;
+  for (int64_t i = 0; i < length_; ++i) {
+    obj = objects[i];
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+      ++null_count;
+    } else if (obj == Py_True) {
+      BitUtil::SetBit(bitmap, i);
+      BitUtil::SetBit(null_bitmap_data_, i);
+    } else if (obj == Py_False) {
+      BitUtil::SetBit(null_bitmap_data_, i);
+    } else {
+      std::stringstream ss;
+      ss << "Error converting from Python objects to Boolean: ";
+      RETURN_NOT_OK(InvalidConversion(obj, "bool", &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+
+  out_arrays_.push_back(
+      std::make_shared<BooleanArray>(length_, data, null_bitmap_, null_count));
+  return Status::OK();
+}
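+
+// Worked example (illustrative): for objects {True, None, False}, the value
+// bitmap gets bit 0 set (0b001), the validity bitmap gets bits 0 and 2 set
+// (0b101), and null_count comes out as 1.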
+
+Status NumPyConverter::ConvertObjectsInfer() {
+  Ndarray1DIndexer<PyObject*> objects;
+
+  PyAcquireGIL lock;
+  objects.Init(arr_);
+  PyDateTime_IMPORT;
+
+  OwnedRef decimal;
+  OwnedRef Decimal;
+  RETURN_NOT_OK(ImportModule("decimal", &decimal));
+  RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
+
+  for (int64_t i = 0; i < length_; ++i) {
+    PyObject* obj = objects[i];
+    if (PandasObjectIsNull(obj)) {
+      continue;
+    } else if (PyObject_is_string(obj)) {
+      return ConvertObjectStrings();
+    } else if (PyObject_is_float(obj)) {
+      return ConvertObjectFloats();
+    } else if (PyBool_Check(obj)) {
+      return ConvertBooleans();
+    } else if (PyObject_is_integer(obj)) {
+      return ConvertObjectIntegers();
+    } else if (PyDate_CheckExact(obj)) {
+      // We could choose Date32 or Date64
+      return ConvertDates<Date32Type>();
+    } else if (PyTime_Check(obj)) {
+      return ConvertTimes();
+    } else if (PyObject_IsInstance(obj, Decimal.obj())) {
+      return ConvertDecimals();
+    } else if (PyList_Check(obj) || PyArray_Check(obj)) {
+      std::shared_ptr<DataType> inferred_type;
+      RETURN_NOT_OK(InferArrowType(obj, &inferred_type));
+      return ConvertLists(inferred_type);
+    } else {
+      const std::string supported_types =
+          "string, bool, float, int, date, time, decimal, list, array";
+      std::stringstream ss;
+      ss << "Error inferring Arrow type for Python object array. ";
+      RETURN_NOT_OK(InvalidConversion(obj, supported_types, &ss));
+      return Status::Invalid(ss.str());
+    }
+  }
+  out_arrays_.push_back(std::make_shared<NullArray>(length_));
+  return Status::OK();
+}
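+
+// Note that inference is driven by the first non-null element alone; an
+// object array like {None, 3.5, "x"} is dispatched to ConvertObjectFloats,
+// which then errors on the string element rather than producing a mixed-type
+// result.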
+
+Status NumPyConverter::ConvertObjectsInferAndCast() {
+  size_t position = out_arrays_.size();
+  RETURN_NOT_OK(ConvertObjectsInfer());
+
+  std::shared_ptr<Array> arr = out_arrays_[position];
+
+  // Perform cast
+  compute::FunctionContext context(pool_);
+  compute::CastOptions options;
+  options.allow_int_overflow = false;
+
+  std::shared_ptr<Array> casted;
+  RETURN_NOT_OK(compute::Cast(&context, *arr, type_, options, &casted));
+
+  // Replace with casted values
+  out_arrays_[position] = casted;
+
+  return Status::OK();
+}
+
+Status NumPyConverter::ConvertObjects() {
+  // Python object arrays are annoying, since we could have one of:
+  //
+  // * Strings
+  // * Booleans with nulls
+  // * decimal.Decimals
+  // * Mixed type (not currently supported by the Arrow format)
+  //
+  // Additionally, nulls may be encoded either as np.nan or None. So we have to
+  // do some type inference and conversion
+
+  RETURN_NOT_OK(InitNullBitmap());
+
+  // This means we received an explicit type from the user
+  if (type_) {
+    switch (type_->id()) {
+      case Type::STRING:
+        return ConvertObjectStrings();
+      case Type::FIXED_SIZE_BINARY:
+        return ConvertObjectFixedWidthBytes(type_);
+      case Type::BOOL:
+        return ConvertBooleans();
+      case Type::DATE32:
+        return ConvertDates<Date32Type>();
+      case Type::DATE64:
+        return ConvertDates<Date64Type>();
+      case Type::LIST: {
+        const auto& list_field = static_cast<const ListType&>(*type_);
+        return ConvertLists(list_field.value_field()->type());
+      }
+      case Type::DECIMAL:
+        return ConvertDecimals();
+      default:
+        return ConvertObjectsInferAndCast();
+    }
+  } else {
+    // No explicit type given, so infer it from the data
+    return ConvertObjectsInfer();
+  }
+}
+
+template <typename T>
+Status LoopPySequence(PyObject* sequence, T func) {
+  if (PySequence_Check(sequence)) {
+    OwnedRef ref;
+    Py_ssize_t size = PySequence_Size(sequence);
+    if (PyArray_Check(sequence)) {
+      auto array = reinterpret_cast<PyArrayObject*>(sequence);
+      Ndarray1DIndexer<PyObject*> objects(array);
+      for (int64_t i = 0; i < size; ++i) {
+        RETURN_NOT_OK(func(objects[i]));
+      }
+    } else {
+      for (int64_t i = 0; i < size; ++i) {
+        ref.reset(PySequence_GetItem(sequence, i));
+        RETURN_NOT_OK(func(ref.obj()));
+      }
+    }
+  } else if (PyObject_HasAttrString(sequence, "__iter__")) {
+    OwnedRef iter = OwnedRef(PyObject_GetIter(sequence));
+    PyObject* item;
+    while ((item = PyIter_Next(iter.obj()))) {
+      OwnedRef ref = OwnedRef(item);
+      RETURN_NOT_OK(func(ref.obj()));
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence or iterable");
+  }
+
+  return Status::OK();
+}
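+
+// LoopPySequence gives the list converters below a single traversal over
+// ndarrays, lists, and generic iterables. An illustrative sketch (assuming
+// `builder` is a ListBuilder*):
+//
+//   auto append_one = [&](PyObject* item) -> Status {
+//     return item == Py_None ? builder->AppendNull() : builder->Append(true);
+//   };
+//   RETURN_NOT_OK(LoopPySequence(sequence, append_one));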
+
+template <int ITEM_TYPE, typename ArrowType>
+inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
+                                                ListBuilder* builder, PyObject* list) {
+  typedef internal::npy_traits<ITEM_TYPE> traits;
+  typedef typename traits::BuilderClass BuilderT;
+
+  PyAcquireGIL lock;
+
+  // TODO: mask not supported here
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  BuilderT* value_builder = static_cast<BuilderT*>(builder->value_builder());
+
+  auto foreach_item = [&](PyObject* object) {
+    if (PandasObjectIsNull(object)) {
+      return builder->AppendNull();
+    } else if (PyArray_Check(object)) {
+      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
+      RETURN_NOT_OK(builder->Append(true));
+
+      // TODO(uwe): Support more complex numpy array structures
+      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
+
+      return AppendNdarrayToBuilder<ITEM_TYPE, BuilderT>(numpy_array, value_builder);
+    } else if (PyList_Check(object)) {
+      int64_t size;
+      std::shared_ptr<DataType> inferred_type;
+      RETURN_NOT_OK(builder->Append(true));
+      RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
+      if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) {
+        std::stringstream ss;
+        ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
+        return Status::TypeError(ss.str());
+      }
+      return AppendPySequence(object, size, type, value_builder);
+    } else {
+      return Status::TypeError("Unsupported Python type for list items");
+    }
+  };
+
+  return LoopPySequence(list, foreach_item);
+}
+
+template <>
+inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, NullType>(
+    const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
+  PyAcquireGIL lock;
+
+  // TODO: mask not supported here
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  auto value_builder = static_cast<NullBuilder*>(builder->value_builder());
+
+  auto foreach_item = [&](PyObject* object) {
+    if (PandasObjectIsNull(object)) {
+      return builder->AppendNull();
+    } else if (PyArray_Check(object)) {
+      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
+      RETURN_NOT_OK(builder->Append(true));
+
+      // TODO(uwe): Support more complex numpy array structures
+      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
+
+      for (int64_t i = 0; i < static_cast<int64_t>(PyArray_SIZE(numpy_array)); ++i) {
+        RETURN_NOT_OK(value_builder->AppendNull());
+      }
+      return Status::OK();
+    } else if (PyList_Check(object)) {
+      RETURN_NOT_OK(builder->Append(true));
+      const Py_ssize_t size = PySequence_Size(object);
+      for (Py_ssize_t i = 0; i < size; ++i) {
+        RETURN_NOT_OK(value_builder->AppendNull());
+      }
+      return Status::OK();
+    } else {
+      return Status::TypeError("Unsupported Python type for list items");
+    }
+  };
+
+  return LoopPySequence(list, foreach_item);
+}
+
+template <>
+inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
+    const std::shared_ptr<DataType>& type, ListBuilder* builder, PyObject* list) {
+  PyAcquireGIL lock;
+  // TODO: If there are bytes involved, convert to Binary representation
+  bool have_bytes = false;
+
+  // TODO: mask not supported here
+  if (mask_ != nullptr) {
+    return Status::NotImplemented("mask not supported in object conversions yet");
+  }
+
+  auto value_builder = static_cast<StringBuilder*>(builder->value_builder());
+
+  auto foreach_item = [&](PyObject* object) {
+    if (PandasObjectIsNull(object)) {
+      return builder->AppendNull();
+    } else if (PyArray_Check(object)) {
+      auto numpy_array = reinterpret_cast<PyArrayObject*>(object);
+      RETURN_NOT_OK(builder->Append(true));
+
+      // TODO(uwe): Support more complex numpy array structures
+      RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, NPY_OBJECT));
+
+      int64_t offset = 0;
+      RETURN_NOT_OK(AppendObjectStrings(numpy_array, nullptr, 0, value_builder, &offset,
+                                        &have_bytes));
+      if (offset < PyArray_SIZE(numpy_array)) {
+        return Status::Invalid("Array cell value exceeded 2GB");
+      }
+      return Status::OK();
+    } else if (PyList_Check(object)) {
+      int64_t size;
+      std::shared_ptr<DataType> inferred_type;
+      RETURN_NOT_OK(builder->Append(true));
+      RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
+      if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) {
+        std::stringstream ss;
+        ss << inferred_type->ToString() << " cannot be converted to STRING.";
+        return Status::TypeError(ss.str());
+      }
+      return AppendPySequence(object, size, inferred_type, value_builder);
+    } else {
+      return Status::TypeError("Unsupported Python type for list items");
+    }
+  };
+
+  return LoopPySequence(list, foreach_item);
+}
+
+#define LIST_CASE(TYPE, NUMPY_TYPE, ArrowType)                            \
+  case Type::TYPE: {                                                      \
+    return ConvertTypedLists<NUMPY_TYPE, ArrowType>(type, builder, list); \
+  }
+
+Status NumPyConverter::ConvertLists(const std::shared_ptr<DataType>& type,
+                                    ListBuilder* builder, PyObject* list) {
+  switch (type->id()) {
+    LIST_CASE(NA, NPY_OBJECT, NullType)
+    LIST_CASE(UINT8, NPY_UINT8, UInt8Type)
+    LIST_CASE(INT8, NPY_INT8, Int8Type)
+    LIST_CASE(UINT16, NPY_UINT16, UInt16Type)
+    LIST_CASE(INT16, NPY_INT16, Int16Type)
+    LIST_CASE(UINT32, NPY_UINT32, UInt32Type)
+    LIST_CASE(INT32, NPY_INT32, Int32Type)
+    LIST_CASE(UINT64, NPY_UINT64, UInt64Type)
+    LIST_CASE(INT64, NPY_INT64, Int64Type)
+    LIST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
+    LIST_CASE(FLOAT, NPY_FLOAT, FloatType)
+    LIST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
+    LIST_CASE(STRING, NPY_OBJECT, StringType)
+    case Type::LIST: {
+      const ListType& list_type = static_cast<const ListType&>(*type);
+      auto value_builder = static_cast<ListBuilder*>(builder->value_builder());
+
+      auto foreach_item = [&](PyObject* object) {
+        if (PandasObjectIsNull(object)) {
+          return builder->AppendNull();
+        } else {
+          RETURN_NOT_OK(builder->Append(true));
+          return ConvertLists(list_type.value_type(), value_builder, object);
+        }
+      };
+
+      return LoopPySequence(list, foreach_item);
+    }
+    default: {
+      std::stringstream ss;
+      ss << "Unknown list item type: ";
+      ss << type->ToString();
+      return Status::TypeError(ss.str());
+    }
+  }
+}
+
+Status NumPyConverter::ConvertLists(const std::shared_ptr<DataType>& type) {
+  std::unique_ptr<ArrayBuilder> array_builder;
+  RETURN_NOT_OK(MakeBuilder(pool_, arrow::list(type), &array_builder));
+  ListBuilder* list_builder = static_cast<ListBuilder*>(array_builder.get());
+  RETURN_NOT_OK(ConvertLists(type, list_builder, reinterpret_cast<PyObject*>(arr_)));
+  return PushBuilderResult(list_builder);
+}
+
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
+                      bool use_pandas_null_sentinels,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  NumPyConverter converter(pool, ao, mo, type, use_pandas_null_sentinels);
+  RETURN_NOT_OK(converter.Convert());
+  DCHECK(converter.result()[0]);
+  *out = std::make_shared<ChunkedArray>(converter.result());
+  return Status::OK();
+}
+
+}  // namespace py
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/796129b4/cpp/src/arrow/python/numpy_to_arrow.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
new file mode 100644
index 0000000..4a70b4b
--- /dev/null
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Converting from NumPy memory representation to Arrow data structures
+
+#ifndef ARROW_PYTHON_NUMPY_TO_ARROW_H
+#define ARROW_PYTHON_NUMPY_TO_ARROW_H
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class MemoryPool;
+class Status;
+
+namespace py {
+
+/// Convert a NumPy array to Arrow. If the target data type is not known,
+/// pass nullptr for type
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] use_pandas_null_sentinels if true, treat pandas null sentinels
+/// (None, np.nan) as nulls
+/// \param[in] type the target Arrow type, or nullptr to infer one from the data
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
+                      bool use_pandas_null_sentinels,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out);
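+
+// A hedged usage sketch (hypothetical caller; assumes the GIL is held and
+// `arr` is a PyObject* referring to a one-dimensional ndarray):
+//
+//   std::shared_ptr<ChunkedArray> out;
+//   RETURN_NOT_OK(NdarrayToArrow(default_memory_pool(), arr, /*mo=*/nullptr,
+//                                /*use_pandas_null_sentinels=*/true,
+//                                ::arrow::utf8(), &out));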
+
+}  // namespace py
+}  // namespace arrow
+
+#endif  // ARROW_PYTHON_NUMPY_TO_ARROW_H