You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/03/28 18:36:28 UTC
arrow git commit: ARROW-30: [Python] Routines for converting between
arrow::Array/Table and pandas.DataFrame
Repository: arrow
Updated Branches:
refs/heads/master 017187749 -> 1fd0668a1
ARROW-30: [Python] Routines for converting between arrow::Array/Table and pandas.DataFrame
There is a lot to do here for maximum compatibility, but this gets things started.
Author: Wes McKinney <we...@apache.org>
Closes #46 from wesm/ARROW-30 and squashes the following commits:
0a9e747 [Wes McKinney] Invoke py.test with python -m pytest
4c9f766 [Wes McKinney] More scaffolding. Table wrapper. Initial unit tests passing
8475a0e [Wes McKinney] More pandas conversion scaffolding, enable libpyarrow to use the NumPy C API globally
d1f05c5 [Wes McKinney] cpplint
f0cc451 [Wes McKinney] Give libpyarrow a reference to numpy.nan
5e09bfe [Wes McKinney] Compiling, but untested draft of pandas <-> arrow converters
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1fd0668a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1fd0668a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1fd0668a
Branch: refs/heads/master
Commit: 1fd0668a1330e72b1b137d90d00906bc188243e0
Parents: 0171877
Author: Wes McKinney <we...@apache.org>
Authored: Mon Mar 28 09:36:20 2016 -0700
Committer: Wes McKinney <we...@apache.org>
Committed: Mon Mar 28 09:36:20 2016 -0700
----------------------------------------------------------------------
ci/travis_script_python.sh | 8 +-
cpp/README.md | 6 +-
cpp/src/arrow/array.h | 13 +-
cpp/src/arrow/types/string.cc | 10 +
cpp/src/arrow/types/string.h | 4 +-
cpp/src/arrow/util/buffer.h | 42 ++
python/CMakeLists.txt | 6 +-
python/pyarrow/__init__.py | 8 +-
python/pyarrow/array.pyx | 135 +++++
python/pyarrow/config.pyx | 13 +-
python/pyarrow/includes/common.pxd | 6 +
python/pyarrow/includes/libarrow.pxd | 52 +-
python/pyarrow/includes/pyarrow.pxd | 9 +-
python/pyarrow/tests/test_convert_pandas.py | 172 ++++++
python/src/pyarrow/adapters/pandas.cc | 714 +++++++++++++++++++++++
python/src/pyarrow/adapters/pandas.h | 21 +
python/src/pyarrow/common.h | 23 +-
python/src/pyarrow/config.cc | 34 ++
python/src/pyarrow/config.h | 39 ++
python/src/pyarrow/do_import_numpy.h | 21 +
python/src/pyarrow/init.cc | 25 -
python/src/pyarrow/init.h | 27 -
python/src/pyarrow/numpy_interop.h | 58 ++
23 files changed, 1355 insertions(+), 91 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/ci/travis_script_python.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index af6b008..d45b895 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -48,17 +48,11 @@ python_version_tests() {
python setup.py build_ext --inplace
- py.test -vv -r sxX pyarrow
+ python -m pytest -vv -r sxX pyarrow
}
# run tests for python 2.7 and 3.5
python_version_tests 2.7
python_version_tests 3.5
-# if [ $TRAVIS_OS_NAME == "linux" ]; then
-# valgrind --tool=memcheck py.test -vv -r sxX arrow
-# else
-# py.test -vv -r sxX arrow
-# fi
-
popd
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/README.md
----------------------------------------------------------------------
diff --git a/cpp/README.md b/cpp/README.md
index 542cce4..9026cf9 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -42,12 +42,12 @@ Detailed unit test logs will be placed in the build directory under `build/test-
### Building/Running benchmarks
-Follow the directions for simple build except run cmake
+Follow the directions for simple build except run cmake
with the `-DARROW_BUILD_BENCHMARKS` parameter set correctly:
cmake -DARROW_BUILD_BENCHMARKS=ON ..
-and instead of make unittest run either `make; ctest` to run both unit tests
+and instead of make unittest run either `make; ctest` to run both unit tests
and benchmarks or `make runbenchmark` to run only the benchmark tests.
Benchmark logs will be placed in the build directory under `build/benchmark-logs`.
@@ -60,4 +60,4 @@ variables
* Googletest: `GTEST_HOME` (only required to build the unit tests)
* Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks)
-
+* Flatbuffers: `FLATBUFFERS_HOME` (only required for the IPC extensions)
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 133adf3..097634d 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -34,13 +34,10 @@ class Buffer;
//
// The base class is only required to have a null bitmap buffer if the null
// count is greater than 0
-//
-// Any buffers used to initialize the array have their references "stolen". If
-// you wish to use the buffer beyond the lifetime of the array, you need to
-// explicitly increment its reference count
class Array {
public:
- Array(const TypePtr& type, int32_t length, int32_t null_count = 0,
+ Array(const std::shared_ptr<DataType>& type, int32_t length,
+ int32_t null_count = 0,
const std::shared_ptr<Buffer>& null_bitmap = nullptr);
virtual ~Array() {}
@@ -60,11 +57,15 @@ class Array {
return null_bitmap_;
}
+ const uint8_t* null_bitmap_data() const {
+ return null_bitmap_data_;
+ }
+
bool EqualsExact(const Array& arr) const;
virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0;
protected:
- TypePtr type_;
+ std::shared_ptr<DataType> type_;
int32_t null_count_;
int32_t length_;
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/types/string.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc
index dea42e1..80b075c 100644
--- a/cpp/src/arrow/types/string.cc
+++ b/cpp/src/arrow/types/string.cc
@@ -20,8 +20,18 @@
#include <sstream>
#include <string>
+#include "arrow/type.h"
+
namespace arrow {
+// Single StringType instance created at namespace scope; the StringArray
+// constructor below passes it as the type of every array it builds.
+const std::shared_ptr<DataType> STRING(new StringType());
+
+// Convenience constructor for callers that do not supply a type: forwards all
+// arguments to the StringArray overload taking an explicit type, with STRING
+// as that type.
+StringArray::StringArray(int32_t length,
+ const std::shared_ptr<Buffer>& offsets,
+ const ArrayPtr& values, int32_t null_count,
+ const std::shared_ptr<Buffer>& null_bitmap) :
+ StringArray(STRING, length, offsets, values, null_count, null_bitmap) {}
+
std::string CharType::ToString() const {
std::stringstream s;
s << "char(" << size << ")";
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/types/string.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index fda722b..84cd032 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -79,9 +79,7 @@ class StringArray : public ListArray {
const std::shared_ptr<Buffer>& offsets,
const ArrayPtr& values,
int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr) :
- StringArray(std::make_shared<StringType>(), length, offsets, values,
- null_count, null_bitmap) {}
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr);
// Compute the pointer t
const uint8_t* GetValue(int i, int32_t* out_length) const {
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/cpp/src/arrow/util/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h
index 0c3e210..c15f9b6 100644
--- a/cpp/src/arrow/util/buffer.h
+++ b/cpp/src/arrow/util/buffer.h
@@ -18,11 +18,13 @@
#ifndef ARROW_UTIL_BUFFER_H
#define ARROW_UTIL_BUFFER_H
+#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include "arrow/util/macros.h"
+#include "arrow/util/status.h"
namespace arrow {
@@ -146,6 +148,46 @@ class PoolBuffer : public ResizableBuffer {
MemoryPool* pool_;
};
+static constexpr int64_t MIN_BUFFER_CAPACITY = 1024;
+
+// Accumulates raw bytes into a PoolBuffer that grows on demand.
+//
+// The backing buffer is created lazily by the first Append() that needs
+// space. Its capacity starts at MIN_BUFFER_CAPACITY bytes and is doubled until
+// the pending data fits. Finish() hands the buffer to the caller and returns
+// the builder to its initial, empty state so it can be reused.
+class BufferBuilder {
+ public:
+  explicit BufferBuilder(MemoryPool* pool) :
+      pool_(pool),
+      data_(nullptr),
+      capacity_(0),
+      size_(0) {}
+
+  // Copies `length` bytes from `data` to the end of the buffer, growing the
+  // buffer first when it is too small. If growing fails the error from
+  // Resize() is returned and the recorded capacity and size are left as they
+  // were, so a later Append() does not write past the real allocation.
+  Status Append(const uint8_t* data, int length) {
+    const int64_t required = size_ + length;
+    if (capacity_ < required) {
+      if (capacity_ == 0) {
+        buffer_ = std::make_shared<PoolBuffer>(pool_);
+      }
+      int64_t new_capacity = std::max(MIN_BUFFER_CAPACITY, capacity_);
+      while (new_capacity < required) {
+        new_capacity *= 2;
+      }
+      RETURN_NOT_OK(buffer_->Resize(new_capacity));
+      // Commit the new capacity only once the allocation has succeeded
+      capacity_ = new_capacity;
+      data_ = buffer_->mutable_data();
+    }
+    // Skip the copy for empty input: data_ may still be null at this point
+    if (length > 0) {
+      memcpy(data_ + size_, data, length);
+      size_ += length;
+    }
+    return Status::OK();
+  }
+
+  // Releases the accumulated buffer to the caller and resets the builder.
+  // Returns nullptr when no Append() ever had to allocate (nothing, or only
+  // empty data, was appended).
+  //
+  // NOTE(review): the buffer was last resized to the builder's capacity, not
+  // to the number of bytes appended, so callers presumably must track the
+  // written length themselves -- confirm against Buffer::size().
+  std::shared_ptr<Buffer> Finish() {
+    auto result = buffer_;
+    buffer_ = nullptr;
+    // Forget the released allocation; otherwise the next Append() would see a
+    // non-zero capacity and dereference the now-null buffer_
+    data_ = nullptr;
+    capacity_ = size_ = 0;
+    return result;
+  }
+
+ private:
+  std::shared_ptr<PoolBuffer> buffer_;
+  MemoryPool* pool_;
+  uint8_t* data_;
+  int64_t capacity_;
+  int64_t size_;
+};
+
} // namespace arrow
#endif // ARROW_UTIL_BUFFER_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 0ecafc7..ebe825f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -220,9 +220,12 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}")
## Python and libraries
find_package(PythonLibsNew REQUIRED)
+find_package(NumPy REQUIRED)
include(UseCython)
include_directories(SYSTEM
+ ${NUMPY_INCLUDE_DIRS}
+ ${PYTHON_INCLUDE_DIRS}
src)
############################################################
@@ -409,11 +412,12 @@ add_subdirectory(src/pyarrow/util)
set(PYARROW_SRCS
src/pyarrow/common.cc
+ src/pyarrow/config.cc
src/pyarrow/helpers.cc
- src/pyarrow/init.cc
src/pyarrow/status.cc
src/pyarrow/adapters/builtin.cc
+ src/pyarrow/adapters/pandas.cc
)
set(LINK_LIBS
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 9a08070..c343f5b 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -17,7 +17,11 @@
# flake8: noqa
-from pyarrow.array import (Array, from_pylist, total_allocated_bytes,
+import pyarrow.config
+
+from pyarrow.array import (Array,
+ from_pandas_series, from_pylist,
+ total_allocated_bytes,
BooleanArray, NumericArray,
Int8Array, UInt8Array,
ListArray, StringArray)
@@ -37,4 +41,4 @@ from pyarrow.schema import (null, bool_,
list_, struct, field,
DataType, Field, Schema, schema)
-from pyarrow.array import RowBatch
+from pyarrow.array import RowBatch, Table, from_pandas_dataframe
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index c5d40dd..88770cd 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -22,6 +22,8 @@
from pyarrow.includes.libarrow cimport *
cimport pyarrow.includes.pyarrow as pyarrow
+import pyarrow.config
+
from pyarrow.compat import frombytes, tobytes
from pyarrow.error cimport check_status
@@ -44,6 +46,10 @@ cdef class Array:
self.type = DataType()
self.type.init(self.sp_array.get().type())
+    @staticmethod
+    def from_pandas(obj, mask=None):
+        """
+        Build an Array from a pandas object; thin wrapper that forwards both
+        arguments to the module-level from_pandas_series.
+        """
+        return from_pandas_series(obj, mask=mask)
+
property null_count:
def __get__(self):
@@ -160,7 +166,15 @@ cdef class StringArray(Array):
cdef dict _array_classes = {
Type_NA: NullArray,
Type_BOOL: BooleanArray,
+ Type_UINT8: UInt8Array,
+ Type_UINT16: UInt16Array,
+ Type_UINT32: UInt32Array,
+ Type_UINT64: UInt64Array,
+ Type_INT8: Int8Array,
+ Type_INT16: Int16Array,
+ Type_INT32: Int32Array,
Type_INT64: Int64Array,
+ Type_FLOAT: FloatArray,
Type_DOUBLE: DoubleArray,
Type_LIST: ListArray,
Type_STRING: StringArray,
@@ -194,6 +208,49 @@ def from_pylist(object list_obj, DataType type=None):
return box_arrow_array(sp_array)
+
+def from_pandas_series(object series, object mask=None):
+    """
+    Convert a pandas Series (or an object passed through unchanged by
+    series_as_ndarray) to an Arrow array.
+
+    When mask is given it is unwrapped the same way as the values and the
+    masked conversion routine is used; otherwise the unmasked one is.
+    """
+    cdef shared_ptr[CArray] out
+
+    values = series_as_ndarray(series)
+
+    if mask is not None:
+        mask = series_as_ndarray(mask)
+        check_status(pyarrow.PandasMaskedToArrow(
+            pyarrow.GetMemoryPool(), values, mask, &out))
+    else:
+        check_status(pyarrow.PandasToArrow(
+            pyarrow.GetMemoryPool(), values, &out))
+
+    return box_arrow_array(out)
+
+
+def from_pandas_dataframe(object df, name=None):
+    """
+    Convert a pandas DataFrame to a Table, one Arrow array per column.
+
+    Parameters
+    ----------
+    df : object with .columns and column access by label (pandas.DataFrame)
+    name : optional name passed on to Table.from_arrays
+
+    Returns
+    -------
+    Table
+    """
+    cdef:
+        list names = []
+        list arrays = []
+
+    # The loop variable must not be called `name`: that would overwrite the
+    # `name` argument and the table would be named after the last column.
+    for column_name in df.columns:
+        names.append(column_name)
+        arrays.append(from_pandas_series(df[column_name]))
+
+    return Table.from_arrays(names, arrays, name=name)
+
+
+cdef object series_as_ndarray(object obj):
+    """
+    Return obj.values when obj is a pandas Series; return any other object
+    unchanged.
+    """
+    import pandas as pd
+
+    if isinstance(obj, pd.Series):
+        return obj.values
+    return obj
+
#----------------------------------------------------------------------
# Table-like data structures
@@ -225,3 +282,81 @@ cdef class RowBatch:
def __getitem__(self, i):
return self.arrays[i]
+
+
+cdef class Table:
+    '''
+    Python wrapper around a shared_ptr to an arrow::Table.
+
+    Do not call this class's constructor directly; use Table.from_arrays or
+    Table.from_pandas.
+    '''
+    cdef:
+        shared_ptr[CTable] sp_table
+        CTable* table
+
+    def __cinit__(self):
+        pass
+
+    cdef init(self, const shared_ptr[CTable]& table):
+        # Keep the shared_ptr for ownership and cache the raw pointer it holds
+        self.sp_table = table
+        self.table = table.get()
+
+    @staticmethod
+    def from_pandas(df, name=None):
+        """
+        Convert a pandas DataFrame to a Table; forwards to the module-level
+        from_pandas_dataframe, mirroring Array.from_pandas.
+        """
+        return from_pandas_dataframe(df, name=name)
+
+    @staticmethod
+    def from_arrays(names, arrays, name=None):
+        """
+        Construct a Table from parallel sequences of column names and Arrays.
+
+        Each array becomes one column whose field is created with the third
+        CField argument set to True. name, when given, becomes the table
+        name; otherwise the table name is the empty string.
+        """
+        cdef:
+            Array arr
+            Table result
+            c_string c_name
+            vector[shared_ptr[CField]] fields
+            vector[shared_ptr[CColumn]] columns
+            shared_ptr[CSchema] schema
+            shared_ptr[CTable] table
+
+        cdef int K = len(arrays)
+
+        fields.resize(K)
+        columns.resize(K)
+        for i in range(K):
+            arr = arrays[i]
+            c_name = tobytes(names[i])
+
+            fields[i].reset(new CField(c_name, arr.type.sp_type, True))
+            columns[i].reset(new CColumn(fields[i], arr.sp_array))
+
+        if name is None:
+            c_name = ''
+        else:
+            c_name = tobytes(name)
+
+        schema.reset(new CSchema(fields))
+        table.reset(new CTable(c_name, schema, columns))
+
+        result = Table()
+        result.init(table)
+
+        return result
+
+    def to_pandas(self):
+        """
+        Convert the arrow::Table to a pandas DataFrame
+
+        Columns are converted one at a time with ArrowToPandas and assembled
+        into a DataFrame that keeps the table's column order.
+        """
+        cdef:
+            PyObject* arr
+            shared_ptr[CColumn] col
+
+        import pandas as pd
+
+        names = []
+        data = []
+        for i in range(self.table.num_columns()):
+            col = self.table.column(i)
+            check_status(pyarrow.ArrowToPandas(col, &arr))
+            names.append(frombytes(col.get().name()))
+            data.append(<object> arr)
+
+            # One ref count too many
+            Py_XDECREF(arr)
+
+        return pd.DataFrame(dict(zip(names, data)), columns=names)
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/config.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx
index 521bc06..1047a47 100644
--- a/python/pyarrow/config.pyx
+++ b/python/pyarrow/config.pyx
@@ -2,7 +2,18 @@
# distutils: language = c++
# cython: embedsignature = True
-cdef extern from 'pyarrow/init.h' namespace 'pyarrow':
+# This header is pulled in without declaring anything from it.
+# NOTE(review): presumably it arranges for this module to be the one that
+# imports the NumPy C API -- confirm against do_import_numpy.h.
+cdef extern from 'pyarrow/do_import_numpy.h':
+ pass
+
+cdef extern from 'pyarrow/numpy_interop.h' namespace 'pyarrow':
+ int import_numpy()
+
+cdef extern from 'pyarrow/config.h' namespace 'pyarrow':
 void pyarrow_init()
+ void pyarrow_set_numpy_nan(object o)
+# Executed when this module is imported: call import_numpy() first, then
+# pyarrow_init(), and finally hand numpy.nan to the C++ library.
+import_numpy()
 pyarrow_init()
+
+import numpy as np
+pyarrow_set_numpy_nan(np.nan)
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/includes/common.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index 839427a..e86d5d7 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -22,10 +22,16 @@ from libcpp cimport bool as c_bool
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector
+from cpython cimport PyObject
+cimport cpython
+
# This must be included for cerr and other things to work
cdef extern from "<iostream>":
pass
+cdef extern from "<Python.h>":
+ void Py_XDECREF(PyObject* o)
+
cdef extern from "<memory>" namespace "std" nogil:
cdef cppclass shared_ptr[T]:
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 943a08f..42f1f25 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -20,6 +20,25 @@
from pyarrow.includes.common cimport *
cdef extern from "arrow/api.h" namespace "arrow" nogil:
+ # We can later add more of the common status factory methods as needed
+ cdef CStatus CStatus_OK "Status::OK"()
+
+ cdef cppclass CStatus "arrow::Status":
+ CStatus()
+
+ c_string ToString()
+
+ c_bool ok()
+ c_bool IsOutOfMemory()
+ c_bool IsKeyError()
+ c_bool IsNotImplemented()
+ c_bool IsInvalid()
+
+ cdef cppclass Buffer:
+ uint8_t* data()
+ int64_t size()
+
+cdef extern from "arrow/api.h" namespace "arrow" nogil:
enum Type" arrow::Type::type":
Type_NA" arrow::Type::NA"
@@ -129,25 +148,30 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CStringArray" arrow::StringArray"(CListArray):
c_string GetString(int i)
+ cdef cppclass CChunkedArray" arrow::ChunkedArray":
+ pass
-cdef extern from "arrow/api.h" namespace "arrow" nogil:
- # We can later add more of the common status factory methods as needed
- cdef CStatus CStatus_OK "Status::OK"()
+ cdef cppclass CColumn" arrow::Column":
+ CColumn(const shared_ptr[CField]& field,
+ const shared_ptr[CArray]& data)
- cdef cppclass CStatus "arrow::Status":
- CStatus()
+ int64_t length()
+ int64_t null_count()
+ const c_string& name()
+ const shared_ptr[CDataType]& type()
+ const shared_ptr[CChunkedArray]& data()
- c_string ToString()
+ cdef cppclass CTable" arrow::Table":
+ CTable(const c_string& name, const shared_ptr[CSchema]& schema,
+ const vector[shared_ptr[CColumn]]& columns)
- c_bool ok()
- c_bool IsOutOfMemory()
- c_bool IsKeyError()
- c_bool IsNotImplemented()
- c_bool IsInvalid()
+ int num_columns()
+ int num_rows()
- cdef cppclass Buffer:
- uint8_t* data()
- int64_t size()
+ const c_string& name()
+
+ const shared_ptr[CSchema]& schema()
+ const shared_ptr[CColumn]& column(int i)
cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil:
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index eedfc85..1066b80 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -18,7 +18,8 @@
# distutils: language = c++
from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport CArray, CDataType, Type, MemoryPool
+from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType,
+ Type, MemoryPool)
cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
# We can later add more of the common status factory methods as needed
@@ -41,4 +42,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
shared_ptr[CDataType] GetPrimitiveType(Type type)
Status ConvertPySequence(object obj, shared_ptr[CArray]* out)
+ Status PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CArray]* out)
+ Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo,
+ shared_ptr[CArray]* out)
+
+ Status ArrowToPandas(const shared_ptr[CColumn]& arr, PyObject** out)
+
MemoryPool* GetMemoryPool()
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
new file mode 100644
index 0000000..6dc9c68
--- /dev/null
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -0,0 +1,172 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+
+import pyarrow as A
+
+
+class TestPandasConversion(unittest.TestCase):
+    """
+    Round-trip tests between pandas/NumPy data and Arrow arrays and tables.
+    """
+
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def _check_pandas_roundtrip(self, df, expected=None):
+        # Convert df to a Table and back; the result must equal `expected`,
+        # or df itself when no expected frame is given
+        table = A.from_pandas_dataframe(df)
+        result = table.to_pandas()
+        if expected is None:
+            expected = df
+        tm.assert_frame_equal(result, expected)
+
+    def test_float_no_nulls(self):
+        data = {}
+        numpy_dtypes = ['f4', 'f8']
+        num_values = 100
+
+        for dtype in numpy_dtypes:
+            values = np.random.randn(num_values)
+            data[dtype] = values.astype(dtype)
+
+        df = pd.DataFrame(data)
+        self._check_pandas_roundtrip(df)
+
+    def test_float_nulls(self):
+        num_values = 100
+
+        null_mask = np.random.randint(0, 10, size=num_values) < 3
+        dtypes = ['f4', 'f8']
+        expected_cols = []
+
+        arrays = []
+        for name in dtypes:
+            values = np.random.randn(num_values).astype(name)
+
+            arr = A.from_pandas_series(values, null_mask)
+            arrays.append(arr)
+
+            # Masked slots are expected to come back as NaN
+            values[null_mask] = np.nan
+
+            expected_cols.append(values)
+
+        ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
+                                columns=dtypes)
+
+        table = A.Table.from_arrays(dtypes, arrays)
+        result = table.to_pandas()
+        tm.assert_frame_equal(result, ex_frame)
+
+    def test_integer_no_nulls(self):
+        data = {}
+
+        numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+        num_values = 100
+
+        for dtype in numpy_dtypes:
+            info = np.iinfo(dtype)
+            # Cap the upper bound at the int64 maximum so the 'u8' range is
+            # still accepted by randint
+            values = np.random.randint(info.min,
+                                       min(info.max, np.iinfo('i8').max),
+                                       size=num_values)
+            data[dtype] = values.astype(dtype)
+
+        df = pd.DataFrame(data)
+        self._check_pandas_roundtrip(df)
+
+    def test_integer_with_nulls(self):
+        # pandas requires upcast to float dtype
+
+        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+        num_values = 100
+
+        null_mask = np.random.randint(0, 10, size=num_values) < 3
+
+        expected_cols = []
+        arrays = []
+        for name in int_dtypes:
+            # Cast to the dtype under test; without the cast every iteration
+            # would exercise the same default integer dtype. The range
+            # [0, 100) fits all eight dtypes.
+            values = np.random.randint(0, 100, size=num_values).astype(name)
+
+            arr = A.from_pandas_series(values, null_mask)
+            arrays.append(arr)
+
+            expected = values.astype('f8')
+            expected[null_mask] = np.nan
+
+            expected_cols.append(expected)
+
+        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
+                                columns=int_dtypes)
+
+        table = A.Table.from_arrays(int_dtypes, arrays)
+        result = table.to_pandas()
+
+        tm.assert_frame_equal(result, ex_frame)
+
+    def test_boolean_no_nulls(self):
+        num_values = 100
+
+        np.random.seed(0)
+
+        df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
+        self._check_pandas_roundtrip(df)
+
+    def test_boolean_nulls(self):
+        # pandas requires upcast to object dtype
+        num_values = 100
+        np.random.seed(0)
+
+        mask = np.random.randint(0, 10, size=num_values) < 3
+        values = np.random.randint(0, 10, size=num_values) < 5
+
+        arr = A.from_pandas_series(values, mask)
+
+        expected = values.astype(object)
+        expected[mask] = None
+
+        ex_frame = pd.DataFrame({'bools': expected})
+
+        table = A.Table.from_arrays(['bools'], [arr])
+        result = table.to_pandas()
+
+        tm.assert_frame_equal(result, ex_frame)
+
+    def test_boolean_object_nulls(self):
+        arr = np.array([False, None, True] * 100, dtype=object)
+        df = pd.DataFrame({'bools': arr})
+        self._check_pandas_roundtrip(df)
+
+    def test_strings(self):
+        repeats = 1000
+        values = [b'foo', None, u'bar', 'qux', np.nan]
+        df = pd.DataFrame({'strings': values * repeats})
+
+        # Both None and NaN in the input are expected to come back as None
+        values = ['foo', None, u'bar', 'qux', None]
+        expected = pd.DataFrame({'strings': values * repeats})
+        self._check_pandas_roundtrip(df, expected)
+
+    # def test_category(self):
+    #     repeats = 1000
+    #     values = [b'foo', None, u'bar', 'qux', np.nan]
+    #     df = pd.DataFrame({'strings': values * repeats})
+    #     df['strings'] = df['strings'].astype('category')
+    #     self._check_pandas_roundtrip(df)
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
new file mode 100644
index 0000000..22f1d75
--- /dev/null
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -0,0 +1,714 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for pandas conversion via NumPy
+
+#include <Python.h>
+
+#include "pyarrow/numpy_interop.h"
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/api.h"
+#include "arrow/util/bit-util.h"
+
+#include "pyarrow/common.h"
+#include "pyarrow/config.h"
+#include "pyarrow/status.h"
+
+namespace pyarrow {
+
+using arrow::Array;
+using arrow::Column;
+namespace util = arrow::util;
+
+// ----------------------------------------------------------------------
+// Serialization
+
+// Compile-time description of a NumPy type number. The primary template is
+// empty; each supported type number gets an explicit specialization.
+template <int TYPE>
+struct npy_traits {};
+
+// NPY_BOOL: one uint8_t per value, converted to arrow::BooleanArray. The
+// values themselves never encode a null.
+template <>
+struct npy_traits<NPY_BOOL> {
+  using value_type = uint8_t;
+  using ArrayType = arrow::BooleanArray;
+
+  static constexpr bool supports_nulls = false;
+
+  static inline bool isnull(uint8_t) { return false; }
+};
+
+// Stamps out the npy_traits specialization for one integer type number:
+// value type T, array class arrow::<CapType>Array, and no value-encoded nulls.
+#define NPY_INT_DECL(TYPE, CapType, T)                \
+  template <>                                         \
+  struct npy_traits<NPY_##TYPE> {                     \
+    using value_type = T;                             \
+    using ArrayType = arrow::CapType##Array;          \
+                                                      \
+    static constexpr bool supports_nulls = false;     \
+                                                      \
+    static inline bool isnull(T) { return false; }    \
+  }
+
+NPY_INT_DECL(INT8, Int8, int8_t);
+NPY_INT_DECL(INT16, Int16, int16_t);
+NPY_INT_DECL(INT32, Int32, int32_t);
+NPY_INT_DECL(INT64, Int64, int64_t);
+NPY_INT_DECL(UINT8, UInt8, uint8_t);
+NPY_INT_DECL(UINT16, UInt16, uint16_t);
+NPY_INT_DECL(UINT32, UInt32, uint32_t);
+NPY_INT_DECL(UINT64, UInt64, uint64_t);
+
+// NPY_FLOAT32: float values converted to arrow::FloatArray; a value that does
+// not compare equal to itself (NaN) is treated as null.
+template <>
+struct npy_traits<NPY_FLOAT32> {
+  using value_type = float;
+  using ArrayType = arrow::FloatArray;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(float v) {
+    const bool equals_itself = (v == v);
+    return !equals_itself;
+  }
+};
+
+// NPY_FLOAT64: double values converted to arrow::DoubleArray; a value that
+// does not compare equal to itself (NaN) is treated as null.
+template <>
+struct npy_traits<NPY_FLOAT64> {
+  using value_type = double;
+  using ArrayType = arrow::DoubleArray;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(double v) {
+    const bool equals_itself = (v == v);
+    return !equals_itself;
+  }
+};
+
+// NPY_OBJECT: elements are PyObject pointers. No ArrayType or isnull is
+// declared here.
+template <>
+struct npy_traits<NPY_OBJECT> {
+  using value_type = PyObject*;
+
+  static constexpr bool supports_nulls = true;
+};
+
+// Converts one NumPy array (plus an optional null mask array) of NumPy type
+// number TYPE into an Arrow array. Memory for the result is taken from the
+// supplied pool.
+template <int TYPE>
+class ArrowSerializer {
+ public:
+  // `mask` may be null, meaning no separate null mask was supplied.
+  ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) :
+      pool_(pool),
+      arr_(arr),
+      mask_(mask),
+      null_bitmap_data_(nullptr) {
+    length_ = PyArray_SIZE(arr_);
+  }
+
+  // Performs the conversion and stores the resulting array in *out.
+  Status Convert(std::shared_ptr<Array>* out);
+
+  // Stride of the first dimension of the input array, as reported by NumPy.
+  int stride() const {
+    return PyArray_STRIDES(arr_)[0];
+  }
+
+  // Allocates the null bitmap with one bit per element and clears every bit.
+  // The conversion routines then set the bit of each non-null element.
+  Status InitNullBitmap() {
+    int null_bytes = util::bytes_for_bits(length_);
+
+    null_bitmap_ = std::make_shared<arrow::PoolBuffer>(pool_);
+    RETURN_ARROW_NOT_OK(null_bitmap_->Resize(null_bytes));
+
+    null_bitmap_data_ = null_bitmap_->mutable_data();
+    memset(null_bitmap_data_, 0, null_bytes);
+
+    return Status::OK();
+  }
+
+  // True when the first-dimension stride differs from the element size.
+  bool is_strided() const {
+    npy_intp* astrides = PyArray_STRIDES(arr_);
+    return astrides[0] != PyArray_DESCR(arr_)->elsize;
+  }
+
+ private:
+  Status ConvertData();
+
+  // Builds a StringArray from an object array. Unicode objects are encoded to
+  // UTF-8, bytes objects are copied as they are, and any other object is
+  // recorded as a null of length zero. InitNullBitmap() must have been called
+  // before this method.
+  //
+  // NOTE(review): elements are read as a contiguous PyObject* sequence and
+  // offsets are 32-bit, so strided input and more than 2GB of string data are
+  // presumably unsupported -- confirm.
+  Status ConvertObjectStrings(std::shared_ptr<Array>* out) {
+    PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+
+    auto offsets_buffer = std::make_shared<arrow::PoolBuffer>(pool_);
+    RETURN_ARROW_NOT_OK(offsets_buffer->Resize(sizeof(int32_t) * (length_ + 1)));
+    int32_t* offsets = reinterpret_cast<int32_t*>(offsets_buffer->mutable_data());
+
+    arrow::BufferBuilder data_builder(pool_);
+    arrow::Status s;
+    PyObject* obj;
+    int length;
+    int offset = 0;
+    int64_t null_count = 0;
+    for (int64_t i = 0; i < length_; ++i) {
+      obj = objects[i];
+      if (PyUnicode_Check(obj)) {
+        obj = PyUnicode_AsUTF8String(obj);
+        if (obj == NULL) {
+          PyErr_Clear();
+          return Status::TypeError("failed converting unicode to UTF8");
+        }
+        length = PyBytes_GET_SIZE(obj);
+        s = data_builder.Append(
+            reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length);
+        // Release the temporary bytes object before any early return
+        Py_DECREF(obj);
+        if (!s.ok()) {
+          return Status::ArrowError(s.ToString());
+        }
+        util::set_bit(null_bitmap_data_, i);
+      } else if (PyBytes_Check(obj)) {
+        length = PyBytes_GET_SIZE(obj);
+        RETURN_ARROW_NOT_OK(data_builder.Append(
+                reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length));
+        util::set_bit(null_bitmap_data_, i);
+      } else {
+        // NULL
+        // No change to offset
+        length = 0;
+        ++null_count;
+      }
+      offsets[i] = offset;
+      offset += length;
+    }
+    // End offset
+    offsets[length_] = offset;
+
+    std::shared_ptr<arrow::Buffer> data_buffer = data_builder.Finish();
+    if (data_buffer == nullptr) {
+      // The builder never allocated (every element was null or empty), so
+      // substitute a fresh buffer instead of dereferencing a null pointer
+      data_buffer = std::make_shared<arrow::PoolBuffer>(pool_);
+    }
+
+    // The number of value bytes is the final offset. The builder's buffer was
+    // resized to its capacity, which can exceed the bytes actually written.
+    auto values = std::make_shared<arrow::UInt8Array>(offset, data_buffer);
+    *out = std::shared_ptr<arrow::Array>(
+        new arrow::StringArray(length_, offsets_buffer, values, null_count,
+            null_bitmap_));
+
+    return Status::OK();
+  }
+
+  // Builds a BooleanArray from an object array: Py_True and Py_False become
+  // valid values and every other object is counted as null.
+  // InitNullBitmap() must have been called before this method.
+  Status ConvertBooleans(std::shared_ptr<Array>* out) {
+    PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+
+    int nbytes = util::bytes_for_bits(length_);
+    auto data = std::make_shared<arrow::PoolBuffer>(pool_);
+    RETURN_ARROW_NOT_OK(data->Resize(nbytes));
+    uint8_t* bitmap = data->mutable_data();
+    memset(bitmap, 0, nbytes);
+
+    int64_t null_count = 0;
+    for (int64_t i = 0; i < length_; ++i) {
+      if (objects[i] == Py_True) {
+        util::set_bit(bitmap, i);
+        util::set_bit(null_bitmap_data_, i);
+      } else if (objects[i] != Py_False) {
+        ++null_count;
+      } else {
+        util::set_bit(null_bitmap_data_, i);
+      }
+    }
+
+    *out = std::make_shared<arrow::BooleanArray>(length_, data, null_count,
+        null_bitmap_);
+
+    return Status::OK();
+  }
+
+  arrow::MemoryPool* pool_;
+
+  PyArrayObject* arr_;
+  PyArrayObject* mask_;
+
+  int64_t length_;
+
+  std::shared_ptr<arrow::Buffer> data_;
+  std::shared_ptr<arrow::ResizableBuffer> null_bitmap_;
+  // Raw pointer into null_bitmap_; null until InitNullBitmap() has run
+  uint8_t* null_bitmap_data_;
+};
+
+// Translates a byte-per-element mask into a validity bitmap: the bit of every
+// element whose mask byte is zero is set in `bitmap`, and every non-zero mask
+// byte is counted as a null.
+//
+// Returns null count
+static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
+  int64_t null_count = 0;
+  const uint8_t* mask_values = static_cast<const uint8_t*>(PyArray_DATA(mask));
+  // TODO(wesm): strided null mask
+  // The index has the same width as `length` so it cannot overflow on long
+  // inputs
+  for (int64_t i = 0; i < length; ++i) {
+    if (mask_values[i]) {
+      ++null_count;
+    } else {
+      util::set_bit(bitmap, i);
+    }
+  }
+  return null_count;
+}
+
+// Scans `length` values of the NumPy type TYPE and builds a validity bitmap:
+// the bit of every value for which npy_traits<TYPE>::isnull is false is set in
+// `bitmap`. Returns the number of values that are null.
+template <int TYPE>
+static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) {
+  typedef npy_traits<TYPE> traits;
+  typedef typename traits::value_type T;
+
+  int64_t null_count = 0;
+  const T* values = reinterpret_cast<const T*>(data);
+
+  // TODO(wesm): striding
+  // The index has the same width as `length` so it cannot overflow on long
+  // inputs
+  for (int64_t i = 0; i < length; ++i) {
+    if (traits::isnull(values[i])) {
+      ++null_count;
+    } else {
+      util::set_bit(bitmap, i);
+    }
+  }
+
+  return null_count;
+}
+
+// Generic conversion for fixed-width NumPy types.
+//
+// A validity bitmap is only built when an explicit mask was supplied or when
+// the NumPy type can itself encode nulls. An explicit mask takes precedence
+// over nulls encoded in the values.
+template <int TYPE>
+inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
+  typedef npy_traits<TYPE> traits;
+
+  const bool has_mask = (mask_ != nullptr);
+
+  int64_t num_nulls = 0;
+  if (has_mask || traits::supports_nulls) {
+    RETURN_NOT_OK(InitNullBitmap());
+
+    if (has_mask) {
+      num_nulls = MaskToBitmap(mask_, length_, null_bitmap_data_);
+    } else {
+      num_nulls = ValuesToBitmap<TYPE>(PyArray_DATA(arr_), length_,
+          null_bitmap_data_);
+    }
+  }
+
+  RETURN_NOT_OK(ConvertData());
+
+  *out = std::make_shared<typename traits::ArrayType>(length_, data_, num_nulls,
+      null_bitmap_);
+  return Status::OK();
+}
+
+// True when `obj` is one of the two null sentinels recognised in object
+// arrays: None, or the registered numpy.nan object (compared by identity).
+static inline bool PyObject_is_null(const PyObject* obj) {
+  if (obj == Py_None) {
+    return true;
+  }
+  return obj == numpy_nan;
+}
+
+// True when `obj` is a text or bytes object: unicode/bytes on Python 3,
+// str/unicode on Python 2.
+static inline bool PyObject_is_string(const PyObject* obj) {
+#if PY_MAJOR_VERSION >= 3
+  if (PyUnicode_Check(obj)) {
+    return true;
+  }
+  return PyBytes_Check(obj);
+#else
+  if (PyString_Check(obj)) {
+    return true;
+  }
+  return PyUnicode_Check(obj);
+#endif
+}
+
+// True when `obj` is a Python bool (True or False).
+//
+// The previous body was a copy of PyObject_is_string and therefore reported
+// strings, not booleans. PyBool_Check behaves the same on Python 2 and 3, so
+// no version switch is needed.
+static inline bool PyObject_is_bool(const PyObject* obj) {
+  return PyBool_Check(obj);
+}
+
+// Conversion for NumPy object arrays.
+//
+// Object arrays may hold strings, booleans with nulls, or mixed types (the
+// latter is not supported). Nulls may appear as None or np.nan. The target
+// type is inferred from the first non-null element only; the chosen
+// converter then handles the whole array.
+template <>
+inline Status ArrowSerializer<NPY_OBJECT>::Convert(std::shared_ptr<Array>* out) {
+  RETURN_NOT_OK(InitNullBitmap());
+
+  // TODO: mask not supported here
+  const PyObject** items = reinterpret_cast<const PyObject**>(PyArray_DATA(arr_));
+
+  // Skip leading nulls to find the element that decides the type
+  int64_t pos = 0;
+  while (pos < length_ && PyObject_is_null(items[pos])) {
+    ++pos;
+  }
+
+  if (pos >= length_) {
+    return Status::TypeError("Unable to infer type of object array, were all null");
+  }
+
+  const PyObject* first_value = items[pos];
+  if (PyObject_is_string(first_value)) {
+    return ConvertObjectStrings(out);
+  }
+  if (PyBool_Check(first_value)) {
+    return ConvertBooleans(out);
+  }
+  return Status::TypeError("unhandled python type");
+}
+
+// Default value conversion: wrap the NumPy array's memory in a NumPyBuffer
+// instead of copying it. Strided input is rejected.
+template <int TYPE>
+inline Status ArrowSerializer<TYPE>::ConvertData() {
+  // TODO(wesm): strided arrays
+  if (!is_strided()) {
+    data_ = std::make_shared<NumPyBuffer>(arr_);
+    return Status::OK();
+  }
+
+  return Status::ValueError("no support for strided data yet");
+}
+
+// Value conversion for NumPy bool arrays: pack the byte-per-value input into
+// a freshly allocated bitmap (one bit per value). Any nonzero byte is treated
+// as true. Strided input is rejected.
+template <>
+inline Status ArrowSerializer<NPY_BOOL>::ConvertData() {
+  if (is_strided()) {
+    return Status::ValueError("no support for strided data yet");
+  }
+
+  int nbytes = util::bytes_for_bits(length_);
+  auto buffer = std::make_shared<arrow::PoolBuffer>(pool_);
+  RETURN_ARROW_NOT_OK(buffer->Resize(nbytes));
+
+  const uint8_t* values = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+  uint8_t* bitmap = buffer->mutable_data();
+
+  // Zero first: the loop below only ever sets bits
+  memset(bitmap, 0, nbytes);
+  // length_ is int64_t, so the index must be too; an `int` index would
+  // overflow for arrays longer than INT_MAX elements.
+  for (int64_t i = 0; i < length_; ++i) {
+    if (values[i] > 0) {
+      util::set_bit(bitmap, i);
+    }
+  }
+
+  data_ = buffer;
+
+  return Status::OK();
+}
+
+// Object arrays are converted by the NPY_OBJECT Convert() specialization;
+// this hook is not implemented for them and always reports a TypeError.
+template <>
+inline Status ArrowSerializer<NPY_OBJECT>::ConvertData() {
+  const char* const message = "NYI";
+  return Status::TypeError(message);
+}
+
+
+// Expands to the switch case for one NumPy type number: instantiate the
+// matching ArrowSerializer and run it, returning early on failure. Relies on
+// `pool`, `arr`, `mask` and `out` being in scope at the expansion site.
+#define TO_ARROW_CASE(TYPE)                                     \
+  case NPY_##TYPE:                                              \
+    {                                                           \
+      ArrowSerializer<NPY_##TYPE> converter(pool, arr, mask);   \
+      RETURN_NOT_OK(converter.Convert(out));                    \
+    }                                                           \
+    break;
+
+// Convert a 1-dimensional NumPy array (plus optional null mask) to an arrow
+// array.
+//
+// pool: memory pool used for buffers allocated during conversion
+// ao:   the values; must be a 1-dimensional NumPy array
+// mo:   optional null mask (nullptr for none); must be a 1-dimensional NumPy
+//       array of 1-byte elements with the same length as `ao`. Nonzero
+//       entries mark nulls.
+// out:  receives the converted array on success
+//
+// Returns TypeError / ValueError for malformed arguments and NotImplemented
+// for NumPy dtypes without a converter.
+Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
+    std::shared_ptr<Array>* out) {
+  // The casts below are unchecked, so reject anything that is not an ndarray
+  if (ao == nullptr || !PyArray_Check(ao)) {
+    return Status::TypeError("input must be a NumPy array");
+  }
+  PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(ao);
+
+  if (PyArray_NDIM(arr) != 1) {
+    return Status::ValueError("only handle 1-dimensional arrays");
+  }
+
+  PyArrayObject* mask = nullptr;
+  if (mo != nullptr) {
+    if (!PyArray_Check(mo)) {
+      return Status::TypeError("mask must be a NumPy array");
+    }
+    mask = reinterpret_cast<PyArrayObject*>(mo);
+
+    // The mask is read as one byte per value for the full length of the
+    // values, so a shorter or wider mask would be read out of bounds or
+    // misinterpreted.
+    if (PyArray_NDIM(mask) != 1 || PyArray_DIM(mask, 0) != PyArray_DIM(arr, 0)) {
+      return Status::ValueError(
+          "mask must be 1-dimensional and the same length as the values");
+    }
+    if (PyArray_ITEMSIZE(mask) != 1) {
+      return Status::TypeError("mask must have a 1-byte (boolean) dtype");
+    }
+  }
+
+  switch(PyArray_DESCR(arr)->type_num) {
+    TO_ARROW_CASE(BOOL);
+    TO_ARROW_CASE(INT8);
+    TO_ARROW_CASE(INT16);
+    TO_ARROW_CASE(INT32);
+    TO_ARROW_CASE(INT64);
+    TO_ARROW_CASE(UINT8);
+    TO_ARROW_CASE(UINT16);
+    TO_ARROW_CASE(UINT32);
+    TO_ARROW_CASE(UINT64);
+    TO_ARROW_CASE(FLOAT32);
+    TO_ARROW_CASE(FLOAT64);
+    TO_ARROW_CASE(OBJECT);
+    default: {
+      std::stringstream ss;
+      ss << "unsupported type " << PyArray_DESCR(arr)->type_num
+         << std::endl;
+      return Status::NotImplemented(ss.str());
+    }
+  }
+  return Status::OK();
+}
+
+// Convenience wrapper around PandasMaskedToArrow for input that carries no
+// explicit null mask.
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+    std::shared_ptr<Array>* out) {
+  PyObject* const no_mask = nullptr;
+  return PandasMaskedToArrow(pool, ao, no_mask, out);
+}
+
+// ----------------------------------------------------------------------
+// Deserialization
+
+// Compile-time description of how each arrow type id maps onto NumPy.
+//
+//   npy_type                         NumPy type number for the no-null output
+//   is_boolean/is_integer/is_floating  select the ConvertValues overload
+//   T                                C value type of the elements
+//
+// NOTE(review): supports_nulls and na_value are declared for completeness;
+// no code visible in this section reads them.
+template <int TYPE>
+struct arrow_traits {
+};
+
+template <>
+struct arrow_traits<arrow::Type::BOOL> {
+  static constexpr bool is_boolean = true;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = false;
+  static constexpr int npy_type = NPY_BOOL;
+  static constexpr bool supports_nulls = false;
+};
+
+// Declares the traits for one integer type; the arrow type id and the NumPy
+// type number share the same suffix (INT8, UINT32, ...).
+#define INT_DECL(TYPE)                                      \
+  template <>                                               \
+  struct arrow_traits<arrow::Type::TYPE> {                  \
+    static constexpr bool is_boolean = false;               \
+    static constexpr bool is_integer = true;                \
+    static constexpr bool is_floating = false;              \
+    static constexpr int npy_type = NPY_##TYPE;             \
+    static constexpr bool supports_nulls = false;           \
+    static constexpr double na_value = NAN;                 \
+    typedef typename npy_traits<NPY_##TYPE>::value_type T;  \
+  };
+
+INT_DECL(INT8);
+INT_DECL(INT16);
+INT_DECL(INT32);
+INT_DECL(INT64);
+INT_DECL(UINT8);
+INT_DECL(UINT16);
+INT_DECL(UINT32);
+INT_DECL(UINT64);
+
+template <>
+struct arrow_traits<arrow::Type::FLOAT> {
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = true;
+  static constexpr int npy_type = NPY_FLOAT32;
+  static constexpr bool supports_nulls = true;
+  static constexpr float na_value = NAN;
+  typedef typename npy_traits<NPY_FLOAT32>::value_type T;
+};
+
+template <>
+struct arrow_traits<arrow::Type::DOUBLE> {
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = true;
+  static constexpr int npy_type = NPY_FLOAT64;
+  static constexpr bool supports_nulls = true;
+  static constexpr double na_value = NAN;
+  typedef typename npy_traits<NPY_FLOAT64>::value_type T;
+};
+
+template <>
+struct arrow_traits<arrow::Type::STRING> {
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = false;
+  static constexpr bool is_floating = false;
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+};
+
+
+// Create a new Python string object from `length` bytes starting at `data`:
+// a unicode object on Python 3, a str object on Python 2. Returns whatever
+// the CPython constructor returns (NULL on failure).
+static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
+  const char* chars = reinterpret_cast<const char*>(data);
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_FromStringAndSize(chars, length);
+#else
+  return PyString_FromStringAndSize(chars, length);
+#endif
+}
+
+// Converts a single-chunk arrow column of arrow type id TYPE into a newly
+// allocated 1-dimensional NumPy array.
+//
+// Null handling depends on the type:
+//  * floating point: nulls become NaN
+//  * integers: with nulls the output is upcast to float64 and nulls become
+//    NaN; without nulls the values are copied as-is
+//  * booleans: with nulls the output is an object array of True/False/None;
+//    without nulls it is a NumPy bool array
+//  * strings: an object array of Python strings, with None for nulls
+template <int TYPE>
+class ArrowDeserializer {
+ public:
+  explicit ArrowDeserializer(const std::shared_ptr<Column>& col) :
+      col_(col),
+      out_(nullptr) {}
+
+  // Run the conversion. On success *out receives the new NumPy array; on
+  // failure *out is left untouched.
+  Status Convert(PyObject** out) {
+    const std::shared_ptr<arrow::ChunkedArray> data = col_->data();
+    if (data->num_chunks() > 1) {
+      return Status::NotImplemented("Chunked column conversion NYI");
+    }
+
+    // NOTE(review): a column with zero chunks is not handled here -- confirm
+    // whether that can occur.
+    auto chunk = data->chunk(0);
+
+    RETURN_NOT_OK(ConvertValues<TYPE>(chunk));
+    *out = reinterpret_cast<PyObject*>(out_);
+    return Status::OK();
+  }
+
+  // Allocate the 1-dimensional output array of NumPy type `type`, sized to
+  // the column length. Returns an error status if NumPy could not create the
+  // array (the Python error indicator set by NumPy is left in place), so
+  // callers never write through a NULL array.
+  Status AllocateOutput(int type) {
+    npy_intp dims[1] = {col_->length()};
+    out_ = reinterpret_cast<PyArrayObject*>(PyArray_SimpleNew(1, dims, type));
+
+    if (out_ == NULL) {
+      return Status::ValueError("failed to allocate NumPy output array");
+    }
+
+    return Status::OK();
+  }
+
+  // Floating point specialization
+  template <int T2>
+  inline typename std::enable_if<
+    arrow_traits<T2>::is_floating, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    typedef typename arrow_traits<T2>::T T;
+
+    arrow::PrimitiveArray* prim_arr = static_cast<arrow::PrimitiveArray*>(
+        arr.get());
+
+    RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+
+    if (arr->null_count() > 0) {
+      T* out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
+      const T* in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        out_values[i] = arr->IsNull(i) ? static_cast<T>(NAN) : in_values[i];
+      }
+    } else {
+      memcpy(PyArray_DATA(out_), prim_arr->data()->data(),
+          arr->length() * arr->type()->value_size());
+    }
+
+    return Status::OK();
+  }
+
+  // Integer specialization
+  template <int T2>
+  inline typename std::enable_if<
+    arrow_traits<T2>::is_integer, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    typedef typename arrow_traits<T2>::T T;
+
+    arrow::PrimitiveArray* prim_arr = static_cast<arrow::PrimitiveArray*>(
+        arr.get());
+
+    const T* in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
+
+    if (arr->null_count() > 0) {
+      RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64));
+
+      // Upcast to double, set NaN as appropriate. Both arms of the
+      // conditional are cast to double explicitly: NAN is a float, so
+      // `cond ? NAN : int_value` would convert the integer to float first
+      // and lose precision before the store.
+      double* out_values = reinterpret_cast<double*>(PyArray_DATA(out_));
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        out_values[i] = prim_arr->IsNull(i)
+            ? static_cast<double>(NAN)
+            : static_cast<double>(in_values[i]);
+      }
+    } else {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+
+      memcpy(PyArray_DATA(out_), in_values,
+          arr->length() * arr->type()->value_size());
+    }
+
+    return Status::OK();
+  }
+
+  // Boolean specialization
+  template <int T2>
+  inline typename std::enable_if<
+    arrow_traits<T2>::is_boolean, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    arrow::BooleanArray* bool_arr = static_cast<arrow::BooleanArray*>(arr.get());
+
+    if (arr->null_count() > 0) {
+      RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+
+      // The object array owns one reference per slot, hence the INCREFs
+      PyObject** out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        PyObject* value;
+        if (bool_arr->IsNull(i)) {
+          value = Py_None;
+        } else if (bool_arr->Value(i)) {
+          value = Py_True;
+        } else {
+          value = Py_False;
+        }
+        Py_INCREF(value);
+        out_values[i] = value;
+      }
+    } else {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<TYPE>::npy_type));
+
+      // Unpack the bitmap into one byte per value
+      uint8_t* out_values = reinterpret_cast<uint8_t*>(PyArray_DATA(out_));
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        out_values[i] = static_cast<uint8_t>(bool_arr->Value(i));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // UTF8
+  template <int T2>
+  inline typename std::enable_if<
+    T2 == arrow::Type::STRING, Status>::type
+  ConvertValues(const std::shared_ptr<Array>& arr) {
+    RETURN_NOT_OK(AllocateOutput(NPY_OBJECT));
+
+    PyObject** out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_));
+
+    arrow::StringArray* string_arr = static_cast<arrow::StringArray*>(arr.get());
+
+    // Only pay for the per-element null check when there are nulls
+    const bool has_nulls = arr->null_count() > 0;
+
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && string_arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        out_values[i] = Py_None;
+        continue;
+      }
+
+      int32_t length;
+      const uint8_t* data = string_arr->GetValue(i, &length);
+
+      out_values[i] = make_pystring(data, length);
+      if (out_values[i] == nullptr) {
+        return FailStringConversion();
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Called when a Python string could not be created: drop the partially
+  // filled output array so it is neither leaked nor returned, and report the
+  // failure instead of Status::OK(). The Python error indicator set by
+  // CPython is left in place.
+  Status FailStringConversion() {
+    Py_XDECREF(out_);
+    out_ = nullptr;
+    return Status::ValueError("failed to create Python string from arrow string data");
+  }
+
+  std::shared_ptr<Column> col_;
+  PyArrayObject* out_;
+};
+
+// Expands to the switch case that dispatches one arrow type id to the
+// matching ArrowDeserializer instantiation and returns its status. Relies on
+// `col` and `out` being in scope at the expansion site.
+#define FROM_ARROW_CASE(TYPE)                                         \
+  case arrow::Type::TYPE:                                             \
+    return ArrowDeserializer<arrow::Type::TYPE>(col).Convert(out);
+
+// Convert an arrow column to a NumPy array, dispatching on the column's
+// arrow type id. Type ids without a converter yield NotImplemented.
+Status ArrowToPandas(const std::shared_ptr<Column>& col, PyObject** out) {
+  switch(col->type()->type) {
+    FROM_ARROW_CASE(BOOL);
+    FROM_ARROW_CASE(INT8);
+    FROM_ARROW_CASE(INT16);
+    FROM_ARROW_CASE(INT32);
+    FROM_ARROW_CASE(INT64);
+    FROM_ARROW_CASE(UINT8);
+    FROM_ARROW_CASE(UINT16);
+    FROM_ARROW_CASE(UINT32);
+    FROM_ARROW_CASE(UINT64);
+    FROM_ARROW_CASE(FLOAT);
+    FROM_ARROW_CASE(DOUBLE);
+    FROM_ARROW_CASE(STRING);
+    default:
+      break;
+  }
+
+  // Every supported case returned above
+  return Status::NotImplemented("Arrow type reading not implemented");
+}
+
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/adapters/pandas.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index a4f4163..58eb3ca 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -21,8 +21,29 @@
#ifndef PYARROW_ADAPTERS_PANDAS_H
#define PYARROW_ADAPTERS_PANDAS_H
+#include <Python.h>
+
+#include <memory>
+
+// Forward declarations of the arrow types named in the prototypes below, so
+// this header compiles on its own without pulling in the arrow headers.
+namespace arrow {
+
+class Array;
+class Column;
+class MemoryPool;
+
+} // namespace arrow
+
namespace pyarrow {
+class Status;
+
+// Convert an arrow column to a NumPy array returned through `out`. Columns
+// with more than one chunk and unsupported arrow types are reported as
+// NotImplemented.
+Status ArrowToPandas(const std::shared_ptr<arrow::Column>& col, PyObject** out);
+
+// Convert the 1-dimensional NumPy array `ao` to an arrow array returned
+// through `out`. `mo` is an optional null mask (may be nullptr) whose nonzero
+// entries mark null slots. `pool` is used for buffers allocated during the
+// conversion.
+Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
+ std::shared_ptr<arrow::Array>* out);
+
+// Same as PandasMaskedToArrow with no null mask.
+Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao,
+ std::shared_ptr<arrow::Array>* out);
+
} // namespace pyarrow
#endif // PYARROW_ADAPTERS_PANDAS_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index db63613..cc9ad9e 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -18,7 +18,9 @@
#ifndef PYARROW_COMMON_H
#define PYARROW_COMMON_H
-#include <Python.h>
+#include "pyarrow/config.h"
+
+#include "arrow/util/buffer.h"
namespace arrow { class MemoryPool; }
@@ -90,6 +92,25 @@ struct PyObjectStringify {
arrow::MemoryPool* GetMemoryPool();
+// An arrow::Buffer view over the memory of a NumPy array. No data is copied:
+// the buffer holds a reference to the array for its whole lifetime so the
+// memory it points at stays alive, and releases it on destruction.
+class NumPyBuffer : public arrow::Buffer {
+ public:
+  explicit NumPyBuffer(PyArrayObject* arr) :
+      Buffer(nullptr, 0) {
+    arr_ = arr;
+    Py_INCREF(arr);
+
+    data_ = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+    // Buffer sizes are byte counts; PyArray_SIZE would give the number of
+    // elements, which under-reports every dtype wider than one byte.
+    size_ = PyArray_NBYTES(arr_);
+  }
+
+  virtual ~NumPyBuffer() {
+    Py_XDECREF(arr_);
+  }
+
+ private:
+  PyArrayObject* arr_;
+};
+
} // namespace pyarrow
#endif // PYARROW_COMMON_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/config.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.cc b/python/src/pyarrow/config.cc
new file mode 100644
index 0000000..730d2db
--- /dev/null
+++ b/python/src/pyarrow/config.cc
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <Python.h>
+
+#include "pyarrow/config.h"
+
+namespace pyarrow {
+
+// Library initialization hook. Currently does nothing.
+void pyarrow_init() {
+}
+
+// The object registered as numpy.nan; nullptr until pyarrow_set_numpy_nan is
+// called. Holds a strong reference.
+PyObject* numpy_nan = nullptr;
+
+// Register `obj` as the numpy.nan sentinel.
+//
+// Takes a new reference to `obj` and releases the previously registered
+// object, so repeated calls do not leak. Passing nullptr clears the
+// registration. The new reference is taken before the old one is dropped so
+// re-registering the same object is safe.
+void pyarrow_set_numpy_nan(PyObject* obj) {
+  Py_XINCREF(obj);
+  PyObject* previous = numpy_nan;
+  numpy_nan = obj;
+  Py_XDECREF(previous);
+}
+
+} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/config.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h
new file mode 100644
index 0000000..48ae715
--- /dev/null
+++ b/python/src/pyarrow/config.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_CONFIG_H
+#define PYARROW_CONFIG_H
+
+#include <Python.h>
+
+#include "pyarrow/numpy_interop.h"
+
+// On Python 3, make PyString_Check an alias of PyUnicode_Check so code
+// written against the Python 2 name still compiles.
+#if PY_MAJOR_VERSION >= 3
+ #define PyString_Check PyUnicode_Check
+#endif
+
+namespace pyarrow {
+
+// The object registered through pyarrow_set_numpy_nan(); defined in config.cc.
+extern PyObject* numpy_nan;
+
+// Library initialization hook.
+void pyarrow_init();
+
+// Register `obj` as the numpy.nan object stored in `numpy_nan`.
+void pyarrow_set_numpy_nan(PyObject* obj);
+
+} // namespace pyarrow
+
+#endif // PYARROW_CONFIG_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/do_import_numpy.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/do_import_numpy.h b/python/src/pyarrow/do_import_numpy.h
new file mode 100644
index 0000000..bb4a382
--- /dev/null
+++ b/python/src/pyarrow/do_import_numpy.h
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Trick borrowed from dynd-python for initializing the NumPy array API
+
+// Trigger the array import (inversion of NO_IMPORT_ARRAY)
+#define NUMPY_IMPORT_ARRAY
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/init.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc
deleted file mode 100644
index acd851e..0000000
--- a/python/src/pyarrow/init.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "pyarrow/init.h"
-
-namespace pyarrow {
-
-void pyarrow_init() {
-}
-
-} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/init.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h
deleted file mode 100644
index 71e67a2..0000000
--- a/python/src/pyarrow/init.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PYARROW_INIT_H
-#define PYARROW_INIT_H
-
-namespace pyarrow {
-
-void pyarrow_init();
-
-} // namespace pyarrow
-
-#endif // PYARROW_INIT_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/1fd0668a/python/src/pyarrow/numpy_interop.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/numpy_interop.h b/python/src/pyarrow/numpy_interop.h
new file mode 100644
index 0000000..882d287
--- /dev/null
+++ b/python/src/pyarrow/numpy_interop.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PYARROW_NUMPY_INTEROP_H
+#define PYARROW_NUMPY_INTEROP_H
+
+#include <Python.h>
+
+#include <numpy/numpyconfig.h>
+
+// Don't use the deprecated Numpy functions
+#ifdef NPY_1_7_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#else
+#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED
+#define NPY_ARRAY_ALIGNED NPY_ALIGNED
+#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE
+#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY
+#endif
+
+// This is required to be able to access the NumPy C API properly in C++ files
+// other than this main one
+#define PY_ARRAY_UNIQUE_SYMBOL pyarrow_ARRAY_API
+#ifndef NUMPY_IMPORT_ARRAY
+#define NO_IMPORT_ARRAY
+#endif
+
+#include <numpy/arrayobject.h>
+#include <numpy/ufuncobject.h>
+
+namespace pyarrow {
+
+// Initialize the NumPy array and ufunc C APIs.
+//
+// The imports are only compiled in when NUMPY_IMPORT_ARRAY is defined
+// (see do_import_numpy.h); otherwise this is a no-op returning 0.
+//
+// Per the NumPy C-API documentation, import_array1(-1) / import_umath1(-1)
+// return -1 from this function if the respective import fails.
+// Returns 0 on success.
+inline int import_numpy() {
+#ifdef NUMPY_IMPORT_ARRAY
+ import_array1(-1);
+ import_umath1(-1);
+#endif
+
+ return 0;
+}
+
+} // namespace pyarrow
+
+#endif // PYARROW_NUMPY_INTEROP_H