You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/19 14:27:42 UTC
[3/3] arrow git commit: ARROW-461: [Python] Add Python interfaces to DictionaryArray data, pandas interop
ARROW-461: [Python] Add Python interfaces to DictionaryArray data, pandas interop
Author: Wes McKinney <we...@twosigma.com>
Closes #291 from wesm/ARROW-461 and squashes the following commits:
b3efe96 [Wes McKinney] Fix cpp unit test, code review comments
285f863 [Wes McKinney] Accept list input in Array.from_pandas
16aa9d6 [Wes McKinney] Add Categorical conversion for single array or column. Required moving code around a little bit in pandas.cc
3d409e8 [Wes McKinney] First round of DataFrame-level dictionary to Categorical conversion
0b20c38 [Wes McKinney] Draft Python wrapper classes for DictionaryType, DictionaryArray. Avoid segfault when conversion to UTF8 fails. Starting on CategoricalBlock implementation
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/9b1b3979
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/9b1b3979
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/9b1b3979
Branch: refs/heads/master
Commit: 9b1b3979b499dc06b71a31b2696534550503d6e2
Parents: 353772f
Author: Wes McKinney <we...@twosigma.com>
Authored: Thu Jan 19 09:27:32 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Jan 19 09:27:32 2017 -0500
----------------------------------------------------------------------
cpp/src/arrow/array-dictionary-test.cc | 2 +-
cpp/src/arrow/array.h | 2 +
cpp/src/arrow/builder.cc | 9 +-
cpp/src/arrow/builder.h | 7 +-
cpp/src/arrow/type.cc | 4 +-
python/CMakeLists.txt | 10 +
python/cmake_modules/FindPythonLibsNew.cmake | 3 +-
python/pyarrow/__init__.py | 11 +-
python/pyarrow/array.pxd | 44 +-
python/pyarrow/array.pyx | 230 +-
python/pyarrow/includes/libarrow.pxd | 38 +-
python/pyarrow/includes/pyarrow.pxd | 6 +-
python/pyarrow/schema.pxd | 8 +-
python/pyarrow/schema.pyx | 28 +
python/pyarrow/table.pyx | 74 +-
python/pyarrow/tests/test_column.py | 1 -
python/pyarrow/tests/test_convert_pandas.py | 61 +-
python/setup.py | 1 +
python/src/pyarrow/CMakeLists.txt | 2 +
python/src/pyarrow/adapters/pandas-test.cc | 64 +
python/src/pyarrow/adapters/pandas.cc | 2694 +++++++++++----------
python/src/pyarrow/adapters/pandas.h | 6 +-
python/src/pyarrow/common.cc | 2 +-
python/src/pyarrow/common.h | 43 +-
python/src/pyarrow/io.cc | 18 +-
python/src/pyarrow/util/CMakeLists.txt | 2 +-
python/src/pyarrow/util/test_main.cc | 10 +
27 files changed, 1881 insertions(+), 1499 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/cpp/src/arrow/array-dictionary-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-dictionary-test.cc b/cpp/src/arrow/array-dictionary-test.cc
index c290153..1a0d49a 100644
--- a/cpp/src/arrow/array-dictionary-test.cc
+++ b/cpp/src/arrow/array-dictionary-test.cc
@@ -46,7 +46,7 @@ TEST(TestDictionary, Basics) {
ASSERT_TRUE(int16()->Equals(type2.index_type()));
ASSERT_TRUE(type2.dictionary()->Equals(dict));
- ASSERT_EQ("dictionary<int32, int16>", type1->ToString());
+ ASSERT_EQ("dictionary<values=int32, indices=int16>", type1->ToString());
}
TEST(TestDictionary, Equals) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 45f8ab9..4f4b727 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -553,6 +553,8 @@ class ARROW_EXPORT DictionaryArray : public Array {
std::shared_ptr<Array> indices() const { return indices_; }
std::shared_ptr<Array> dictionary() const;
+ const DictionaryType* dict_type() { return dict_type_; }
+
bool EqualsExact(const DictionaryArray& other) const;
bool Equals(const std::shared_ptr<Array>& arr) const override;
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index a308ea5..b0dc41b 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -421,9 +421,12 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(FLOAT, FloatBuilder);
BUILDER_CASE(DOUBLE, DoubleBuilder);
- BUILDER_CASE(STRING, StringBuilder);
- BUILDER_CASE(BINARY, BinaryBuilder);
-
+ case Type::STRING:
+ out->reset(new StringBuilder(pool));
+ return Status::OK();
+ case Type::BINARY:
+ out->reset(new BinaryBuilder(pool, type));
+ return Status::OK();
case Type::LIST: {
std::shared_ptr<ArrayBuilder> value_builder;
std::shared_ptr<DataType> value_type =
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 1837340..735bca1 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -24,6 +24,7 @@
#include <vector>
#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/bit-util.h"
@@ -33,7 +34,6 @@
namespace arrow {
class Array;
-class MemoryPool;
static constexpr int32_t kMinBuilderCapacity = 1 << 5;
@@ -378,7 +378,10 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
// String builder
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
public:
- explicit StringBuilder(MemoryPool* pool, const TypePtr& type)
+ explicit StringBuilder(MemoryPool* pool = default_memory_pool())
+ : BinaryBuilder(pool, utf8()) {}
+
+ explicit StringBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
: BinaryBuilder(pool, type) {}
using BinaryBuilder::Append;
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 954fba7..ba77584 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -148,8 +148,8 @@ bool DictionaryType::Equals(const DataType& other) const {
std::string DictionaryType::ToString() const {
std::stringstream ss;
- ss << "dictionary<" << dictionary_->type()->ToString() << ", "
- << index_type_->ToString() << ">";
+ ss << "dictionary<values=" << dictionary_->type()->ToString()
+ << ", indices=" << index_type_->ToString() << ">";
return ss.str();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 45115d4..0a2d4e9 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -55,6 +55,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
OFF)
endif()
+if(NOT PYARROW_BUILD_TESTS)
+ set(NO_TESTS 1)
+endif()
+
find_program(CCACHE_FOUND ccache)
if(CCACHE_FOUND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
@@ -339,6 +343,12 @@ set(PYARROW_MIN_TEST_LIBS
pyarrow
${PYARROW_BASE_LIBS})
+if(NOT APPLE)
+ ADD_THIRDPARTY_LIB(python
+ SHARED_LIB "${PYTHON_LIBRARIES}")
+ list(APPEND PYARROW_MIN_TEST_LIBS python)
+endif()
+
set(PYARROW_TEST_LINK_LIBS ${PYARROW_MIN_TEST_LIBS})
############################################################
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/cmake_modules/FindPythonLibsNew.cmake
----------------------------------------------------------------------
diff --git a/python/cmake_modules/FindPythonLibsNew.cmake b/python/cmake_modules/FindPythonLibsNew.cmake
index 5cb65c9..1000a95 100644
--- a/python/cmake_modules/FindPythonLibsNew.cmake
+++ b/python/cmake_modules/FindPythonLibsNew.cmake
@@ -161,6 +161,7 @@ else()
set(_PYTHON_LIBS_SEARCH "${PYTHON_PREFIX}/lib" "${PYTHON_LIBRARY_PATH}")
endif()
message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}")
+ message(STATUS "Looking for python${PYTHON_LIBRARY_SUFFIX}")
# Probably this needs to be more involved. It would be nice if the config
# information the python interpreter itself gave us were more complete.
find_library(PYTHON_LIBRARY
@@ -237,4 +238,4 @@ FUNCTION(PYTHON_ADD_MODULE _NAME )
ENDIF()
ENDIF(PYTHON_ENABLE_MODULE_${_NAME})
-ENDFUNCTION(PYTHON_ADD_MODULE)
\ No newline at end of file
+ENDFUNCTION(PYTHON_ADD_MODULE)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index a8c3e8e..efffbf2 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -31,9 +31,14 @@ from pyarrow.config import cpu_count, set_cpu_count
from pyarrow.array import (Array,
from_pandas_series, from_pylist,
total_allocated_bytes,
- BooleanArray, NumericArray,
+ NumericArray, IntegerArray, FloatingPointArray,
+ BooleanArray,
Int8Array, UInt8Array,
- ListArray, StringArray)
+ Int16Array, UInt16Array,
+ Int32Array, UInt32Array,
+ Int64Array, UInt64Array,
+ ListArray, StringArray,
+ DictionaryArray)
from pyarrow.error import ArrowException
@@ -52,7 +57,7 @@ from pyarrow.schema import (null, bool_,
uint8, uint16, uint32, uint64,
timestamp, date,
float_, double, binary, string,
- list_, struct, field,
+ list_, struct, dictionary, field,
DataType, Field, Schema, schema)
from pyarrow.table import Column, RecordBatch, Table, concat_tables
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd
index 8cd15cd..af10535 100644
--- a/python/pyarrow/array.pxd
+++ b/python/pyarrow/array.pxd
@@ -22,6 +22,8 @@ from pyarrow.scalar import NA
from pyarrow.schema cimport DataType
+from cpython cimport PyObject
+
cdef extern from "Python.h":
int PySlice_Check(object)
@@ -47,35 +49,50 @@ cdef class NumericArray(Array):
pass
-cdef class Int8Array(NumericArray):
+cdef class IntegerArray(NumericArray):
+ pass
+
+cdef class FloatingPointArray(NumericArray):
+ pass
+
+
+cdef class Int8Array(IntegerArray):
+ pass
+
+
+cdef class UInt8Array(IntegerArray):
+ pass
+
+
+cdef class Int16Array(IntegerArray):
pass
-cdef class UInt8Array(NumericArray):
+cdef class UInt16Array(IntegerArray):
pass
-cdef class Int16Array(NumericArray):
+cdef class Int32Array(IntegerArray):
pass
-cdef class UInt16Array(NumericArray):
+cdef class UInt32Array(IntegerArray):
pass
-cdef class Int32Array(NumericArray):
+cdef class Int64Array(IntegerArray):
pass
-cdef class UInt32Array(NumericArray):
+cdef class UInt64Array(IntegerArray):
pass
-cdef class Int64Array(NumericArray):
+cdef class FloatArray(FloatingPointArray):
pass
-cdef class UInt64Array(NumericArray):
+cdef class DoubleArray(FloatingPointArray):
pass
@@ -85,3 +102,14 @@ cdef class ListArray(Array):
cdef class StringArray(Array):
pass
+
+
+cdef class BinaryArray(Array):
+ pass
+
+
+cdef class DictionaryArray(Array):
+ pass
+
+
+cdef wrap_array_output(PyObject* output)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 4299ba6..92206f2 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -33,12 +33,17 @@ from pyarrow.error cimport check_status
cimport pyarrow.scalar as scalar
from pyarrow.scalar import NA
-from pyarrow.schema cimport Field, Schema
+from pyarrow.schema cimport Field, Schema, DictionaryType
import pyarrow.schema as schema
cimport cpython
+cdef _pandas():
+ import pandas as pd
+ return pd
+
+
def total_allocated_bytes():
cdef MemoryPool* pool = pyarrow.get_memory_pool()
return pool.bytes_allocated()
@@ -53,20 +58,22 @@ cdef class Array:
self.type.init(self.sp_array.get().type())
@staticmethod
- def from_pandas(obj, mask=None):
+ def from_pandas(obj, mask=None, timestamps_to_ms=False, Field field=None):
"""
- Create an array from a pandas.Series
+ Convert pandas.Series to an Arrow Array.
Parameters
----------
- obj : pandas.Series or numpy.ndarray
- vector holding the data
- mask : numpy.ndarray, optional
+ series : pandas.Series or numpy.ndarray
+
+ mask : pandas.Series or numpy.ndarray, optional
boolean mask if the object is valid or null
- Returns
- -------
- pyarrow.Array
+ timestamps_to_ms : bool, optional
+ Convert datetime columns to ms resolution. This is needed for
+ compatibility with other functionality like Parquet I/O which
+ only supports milliseconds.
+
Examples
--------
@@ -80,16 +87,47 @@ cdef class Array:
2
]
-
>>> import numpy as np
- >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1], dtype=bool))
+ >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1],
+ ... dtype=bool))
<pyarrow.array.Int64Array object at 0x7f9019e11208>
[
1,
NA
]
+
+ Returns
+ -------
+ pyarrow.array.Array
"""
- return from_pandas_series(obj, mask)
+ cdef:
+ shared_ptr[CArray] out
+ shared_ptr[CField] c_field
+
+ pd = _pandas()
+
+ if field is not None:
+ c_field = field.sp_field
+
+ if mask is not None:
+ mask = get_series_values(mask)
+
+ series_values = get_series_values(obj)
+
+ if isinstance(series_values, pd.Categorical):
+ return DictionaryArray.from_arrays(series_values.codes,
+ series_values.categories.values,
+ mask=mask)
+ else:
+ if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
+ series_values = series_values.astype('datetime64[ms]')
+
+ with nogil:
+ check_status(pyarrow.PandasToArrow(
+ pyarrow.get_memory_pool(), series_values, mask,
+ c_field, &out))
+
+ return box_arrow_array(out)
@staticmethod
def from_list(object list_obj, DataType type=None):
@@ -183,12 +221,13 @@ cdef class Array:
RecordBatch.to_pandas
"""
cdef:
- PyObject* np_arr
-
- check_status(pyarrow.ConvertArrayToPandas(
- self.sp_array, <PyObject*> self, &np_arr))
+ PyObject* out
- return PyObject_to_object(np_arr)
+ with nogil:
+ check_status(
+ pyarrow.ConvertArrayToPandas(self.sp_array, <PyObject*> self,
+ &out))
+ return wrap_array_output(out)
def to_pylist(self):
"""
@@ -197,6 +236,17 @@ cdef class Array:
return [x.as_py() for x in self]
+cdef wrap_array_output(PyObject* output):
+ cdef object obj = PyObject_to_object(output)
+
+ if isinstance(obj, dict):
+ return _pandas().Categorical(obj['indices'],
+ categories=obj['dictionary'],
+ fastpath=True)
+ else:
+ return obj
+
+
cdef class NullArray(Array):
pass
@@ -209,35 +259,43 @@ cdef class NumericArray(Array):
pass
-cdef class Int8Array(NumericArray):
+cdef class IntegerArray(NumericArray):
+ pass
+
+
+cdef class FloatingPointArray(NumericArray):
+ pass
+
+
+cdef class Int8Array(IntegerArray):
pass
-cdef class UInt8Array(NumericArray):
+cdef class UInt8Array(IntegerArray):
pass
-cdef class Int16Array(NumericArray):
+cdef class Int16Array(IntegerArray):
pass
-cdef class UInt16Array(NumericArray):
+cdef class UInt16Array(IntegerArray):
pass
-cdef class Int32Array(NumericArray):
+cdef class Int32Array(IntegerArray):
pass
-cdef class UInt32Array(NumericArray):
+cdef class UInt32Array(IntegerArray):
pass
-cdef class Int64Array(NumericArray):
+cdef class Int64Array(IntegerArray):
pass
-cdef class UInt64Array(NumericArray):
+cdef class UInt64Array(IntegerArray):
pass
@@ -245,11 +303,11 @@ cdef class DateArray(NumericArray):
pass
-cdef class FloatArray(NumericArray):
+cdef class FloatArray(FloatingPointArray):
pass
-cdef class DoubleArray(NumericArray):
+cdef class DoubleArray(FloatingPointArray):
pass
@@ -265,6 +323,46 @@ cdef class BinaryArray(Array):
pass
+cdef class DictionaryArray(Array):
+
+ @staticmethod
+ def from_arrays(indices, dictionary, mask=None):
+ """
+ Construct Arrow DictionaryArray from array of indices (must be
+ non-negative integers) and corresponding array of dictionary values
+
+ Parameters
+ ----------
+ indices : ndarray or pandas.Series, integer type
+ dictionary : ndarray or pandas.Series
+ mask : ndarray or pandas.Series, boolean type
+ True values indicate that indices are actually null
+
+ Returns
+ -------
+ dict_array : DictionaryArray
+ """
+ cdef:
+ Array arrow_indices, arrow_dictionary
+ DictionaryArray result
+ shared_ptr[CDataType] c_type
+ shared_ptr[CArray] c_result
+
+ arrow_indices = Array.from_pandas(indices, mask=mask)
+ arrow_dictionary = Array.from_pandas(dictionary)
+
+ if not isinstance(arrow_indices, IntegerArray):
+ raise ValueError('Indices must be integer type')
+
+ c_type.reset(new CDictionaryType(arrow_indices.type.sp_type,
+ arrow_dictionary.sp_array))
+ c_result.reset(new CDictionaryArray(c_type, arrow_indices.sp_array))
+
+ result = DictionaryArray()
+ result.init(c_result)
+ return result
+
+
cdef dict _array_classes = {
Type_NA: NullArray,
Type_BOOL: BooleanArray,
@@ -283,6 +381,7 @@ cdef dict _array_classes = {
Type_BINARY: BinaryArray,
Type_STRING: StringArray,
Type_TIMESTAMP: Int64Array,
+ Type_DICTIONARY: DictionaryArray
}
cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
@@ -299,83 +398,18 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
return arr
-def from_pylist(object list_obj, DataType type=None):
- """
- Convert Python list to Arrow array
-
- Parameters
- ----------
- list_obj : array_like
-
- Returns
- -------
- pyarrow.array.Array
- """
- cdef:
- shared_ptr[CArray] sp_array
-
- if type is None:
- check_status(pyarrow.ConvertPySequence(list_obj, &sp_array))
- else:
- raise NotImplementedError()
-
- return box_arrow_array(sp_array)
-
-
-def from_pandas_series(object series, object mask=None, timestamps_to_ms=False, Field field=None):
- """
- Convert pandas.Series to an Arrow Array.
-
- Parameters
- ----------
- series : pandas.Series or numpy.ndarray
-
- mask : pandas.Series or numpy.ndarray, optional
- array to mask null entries in the series
-
- timestamps_to_ms : bool, optional
- Convert datetime columns to ms resolution. This is needed for
- compability with other functionality like Parquet I/O which
- only supports milliseconds.
-
- field: pyarrow.Field
- Schema indicator to what type this column should render in Arrow
-
- Returns
- -------
- pyarrow.array.Array
- """
- cdef:
- shared_ptr[CArray] out
- shared_ptr[CField] c_field
-
- series_values = series_as_ndarray(series)
- if series_values.dtype.type == np.datetime64 and timestamps_to_ms:
- series_values = series_values.astype('datetime64[ms]')
- if field is not None:
- c_field = field.sp_field
-
- if mask is None:
- with nogil:
- check_status(pyarrow.PandasToArrow(pyarrow.get_memory_pool(),
- series_values, c_field, &out))
- else:
- mask = series_as_ndarray(mask)
- with nogil:
- check_status(pyarrow.PandasMaskedToArrow(
- pyarrow.get_memory_pool(), series_values, mask, c_field, &out))
-
- return box_arrow_array(out)
-
-
-cdef object series_as_ndarray(object obj):
+cdef object get_series_values(object obj):
import pandas as pd
if isinstance(obj, pd.Series):
result = obj.values
- else:
+ elif isinstance(obj, np.ndarray):
result = obj
+ else:
+ result = pd.Series(obj).values
return result
+
from_pylist = Array.from_list
+from_pandas_series = Array.from_pandas
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8b0e3b6..6284ad3 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -45,6 +45,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
Type_LIST" arrow::Type::LIST"
Type_STRUCT" arrow::Type::STRUCT"
+ Type_DICTIONARY" arrow::Type::DICTIONARY"
enum TimeUnit" arrow::TimeUnit":
TimeUnit_SECOND" arrow::TimeUnit::SECOND"
@@ -60,6 +61,33 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string ToString()
+ cdef cppclass CArray" arrow::Array":
+ shared_ptr[CDataType] type()
+
+ int32_t length()
+ int32_t null_count()
+ Type type_enum()
+
+ c_bool Equals(const shared_ptr[CArray]& arr)
+ c_bool IsNull(int i)
+
+ cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
+ int bit_width()
+
+ cdef cppclass CDictionaryArray" arrow::DictionaryArray"(CArray):
+ CDictionaryArray(const shared_ptr[CDataType]& type,
+ const shared_ptr[CArray]& indices)
+
+ shared_ptr[CArray] indices()
+ shared_ptr[CArray] dictionary()
+
+ cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType):
+ CDictionaryType(const shared_ptr[CDataType]& index_type,
+ const shared_ptr[CArray]& dictionary)
+
+ shared_ptr[CDataType] index_type()
+ shared_ptr[CArray] dictionary()
+
shared_ptr[CDataType] timestamp(TimeUnit unit)
cdef cppclass MemoryPool" arrow::MemoryPool":
@@ -111,16 +139,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int num_fields()
c_string ToString()
- cdef cppclass CArray" arrow::Array":
- shared_ptr[CDataType] type()
-
- int32_t length()
- int32_t null_count()
- Type type_enum()
-
- c_bool Equals(const shared_ptr[CArray]& arr)
- c_bool IsNull(int i)
-
cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray):
c_bool Value(int i)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index b7b8d7c..04ad4f3 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -30,11 +30,9 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out)
- CStatus PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CField] field,
+ CStatus PandasToArrow(MemoryPool* pool, object ao, object mo,
+ shared_ptr[CField] field,
shared_ptr[CArray]* out)
- CStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo,
- shared_ptr[CField] field,
- shared_ptr[CArray]* out)
CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr,
PyObject* py_ref, PyObject** out)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/schema.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd
index 42588d4..390954c 100644
--- a/python/pyarrow/schema.pxd
+++ b/python/pyarrow/schema.pxd
@@ -16,7 +16,8 @@
# under the License.
from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport CDataType, CField, CSchema
+from pyarrow.includes.libarrow cimport (CDataType, CDictionaryType,
+ CField, CSchema)
cdef class DataType:
cdef:
@@ -25,6 +26,11 @@ cdef class DataType:
cdef init(self, const shared_ptr[CDataType]& type)
+
+cdef class DictionaryType(DataType):
+ cdef:
+ const CDictionaryType* dict_type
+
cdef class Field:
cdef:
shared_ptr[CField] sp_field
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index 85b1617..2bcfec1 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -25,6 +25,7 @@
from cython.operator cimport dereference as deref
from pyarrow.compat import frombytes, tobytes
+from pyarrow.array cimport Array
from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType,
Type_NA, Type_BOOL,
Type_UINT8, Type_INT8,
@@ -66,6 +67,19 @@ cdef class DataType:
raise TypeError('Invalid comparison')
+cdef class DictionaryType(DataType):
+
+ cdef init(self, const shared_ptr[CDataType]& type):
+ DataType.init(self, type)
+ self.dict_type = <const CDictionaryType*> type.get()
+
+ def __str__(self):
+ return frombytes(self.type.ToString())
+
+ def __repr__(self):
+ return 'DictionaryType({0})'.format(str(self))
+
+
cdef class Field:
def __cinit__(self):
@@ -269,6 +283,7 @@ def binary():
"""
return primitive_type(Type_BINARY)
+
def list_(DataType value_type):
cdef DataType out = DataType()
cdef shared_ptr[CDataType] list_type
@@ -276,6 +291,19 @@ def list_(DataType value_type):
out.init(list_type)
return out
+
+def dictionary(DataType index_type, Array dictionary):
+ """
+ Dictionary (categorical, or simply encoded) type
+ """
+ cdef DictionaryType out = DictionaryType()
+ cdef shared_ptr[CDataType] dict_type
+ dict_type.reset(new CDictionaryType(index_type.sp_type,
+ dictionary.sp_array))
+ out.init(dict_type)
+ return out
+
+
def struct(fields):
"""
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index b720a47..0e3b2bd 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -27,7 +27,7 @@ cimport pyarrow.includes.pyarrow as pyarrow
import pyarrow.config
-from pyarrow.array cimport Array, box_arrow_array
+from pyarrow.array cimport Array, box_arrow_array, wrap_array_output
from pyarrow.error import ArrowException
from pyarrow.error cimport check_status
from pyarrow.schema cimport box_data_type, box_schema, Field
@@ -39,6 +39,11 @@ cimport cpython
from collections import OrderedDict
+cdef _pandas():
+ import pandas as pd
+ return pd
+
+
cdef class ChunkedArray:
"""
Array backed via one or more memory chunks.
@@ -146,14 +151,12 @@ cdef class Column:
pandas.Series
"""
cdef:
- PyObject* arr
-
- import pandas as pd
+ PyObject* out
check_status(pyarrow.ConvertColumnToPandas(self.sp_column,
- <PyObject*> self, &arr))
+ <PyObject*> self, &out))
- return pd.Series(PyObject_to_object(arr), name=self.name)
+ return _pandas().Series(wrap_array_output(out), name=self.name)
def equals(self, Column other):
"""
@@ -278,8 +281,6 @@ cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
cdef _dataframe_to_arrays(df, name, timestamps_to_ms, Schema schema):
- from pyarrow.array import from_pandas_series
-
cdef:
list names = []
list arrays = []
@@ -289,9 +290,8 @@ cdef _dataframe_to_arrays(df, name, timestamps_to_ms, Schema schema):
col = df[name]
if schema is not None:
field = schema.field_by_name(name)
- arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms,
- field=field)
-
+ arr = Array.from_pandas(col, timestamps_to_ms=timestamps_to_ms,
+ field=field)
names.append(name)
arrays.append(arr)
@@ -304,7 +304,8 @@ cdef class RecordBatch:
Warning
-------
- Do not call this class's constructor directly, use one of the ``from_*`` methods instead.
+ Do not call this class's constructor directly, use one of the ``from_*``
+ methods instead.
"""
def __cinit__(self):
@@ -401,7 +402,7 @@ cdef class RecordBatch:
return OrderedDict(entries)
- def to_pandas(self):
+ def to_pandas(self, nthreads=None):
"""
Convert the arrow::RecordBatch to a pandas DataFrame
@@ -409,23 +410,7 @@ cdef class RecordBatch:
-------
pandas.DataFrame
"""
- cdef:
- PyObject* np_arr
- shared_ptr[CArray] arr
- Column column
-
- import pandas as pd
-
- names = []
- data = []
- for i in range(self.batch.num_columns()):
- arr = self.batch.column(i)
- check_status(pyarrow.ConvertArrayToPandas(arr, <PyObject*> self,
- &np_arr))
- names.append(frombytes(self.batch.column_name(i)))
- data.append(PyObject_to_object(np_arr))
-
- return pd.DataFrame(dict(zip(names, data)), columns=names)
+ return Table.from_batches([self]).to_pandas(nthreads=nthreads)
@classmethod
def from_pandas(cls, df, schema=None):
@@ -490,8 +475,8 @@ cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads):
CColumn* col
int i
- from pandas.core.internals import BlockManager, make_block
- from pandas import RangeIndex
+ import pandas.core.internals as _int
+ from pandas import RangeIndex, Categorical
with nogil:
check_status(pyarrow.ConvertTableToPandas(table, nthreads,
@@ -500,8 +485,19 @@ cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads):
result = PyObject_to_object(result_obj)
blocks = []
- for block_arr, placement_arr in result:
- blocks.append(make_block(block_arr, placement=placement_arr))
+ for item in result:
+ block_arr = item['block']
+ placement = item['placement']
+ if 'dictionary' in item:
+ cat = Categorical(block_arr,
+ categories=item['dictionary'],
+ ordered=False, fastpath=True)
+ block = _int.make_block(cat, placement=placement,
+ klass=_int.CategoricalBlock,
+ fastpath=True)
+ else:
+ block = _int.make_block(block_arr, placement=placement)
+ blocks.append(block)
names = []
for i in range(table.get().num_columns()):
@@ -509,7 +505,7 @@ cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads):
names.append(frombytes(col.name()))
axes = [names, RangeIndex(table.get().num_rows())]
- return BlockManager(blocks, axes)
+ return _int.BlockManager(blocks, axes)
cdef class Table:
@@ -518,7 +514,8 @@ cdef class Table:
Warning
-------
- Do not call this class's constructor directly, use one of the ``from_*`` methods instead.
+ Do not call this class's constructor directly, use one of the ``from_*``
+ methods instead.
"""
def __cinit__(self):
@@ -688,13 +685,11 @@ cdef class Table:
-------
pandas.DataFrame
"""
- import pandas as pd
-
if nthreads is None:
nthreads = pyarrow.config.cpu_count()
mgr = table_to_blockmanager(self.sp_table, nthreads)
- return pd.DataFrame(mgr)
+ return _pandas().DataFrame(mgr)
def to_pydict(self):
"""
@@ -835,6 +830,7 @@ cdef api object table_from_ctable(const shared_ptr[CTable]& ctable):
table.init(ctable)
return table
+
cdef api object batch_from_cbatch(const shared_ptr[CRecordBatch]& cbatch):
cdef RecordBatch batch = RecordBatch()
batch.init(cbatch)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/tests/test_column.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_column.py b/python/pyarrow/tests/test_column.py
index 32202cb..1a507c8 100644
--- a/python/pyarrow/tests/test_column.py
+++ b/python/pyarrow/tests/test_column.py
@@ -47,4 +47,3 @@ class TestColumn(unittest.TestCase):
assert series.name == 'a'
assert series.shape == (5,)
assert series.iloc[0] == -10
-
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 3928a1f..a2f5062 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -62,8 +62,10 @@ class TestPandasConversion(unittest.TestCase):
pass
def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
- timestamps_to_ms=False, expected_schema=None, schema=None):
- table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms, schema=schema)
+ timestamps_to_ms=False, expected_schema=None,
+ schema=None):
+ table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms,
+ schema=schema)
result = table.to_pandas(nthreads=nthreads)
if expected_schema:
assert table.schema.equals(expected_schema)
@@ -71,6 +73,13 @@ class TestPandasConversion(unittest.TestCase):
expected = df
tm.assert_frame_equal(result, expected)
+ def _check_array_roundtrip(self, values, expected=None,
+ timestamps_to_ms=False, field=None):
+ arr = A.Array.from_pandas(values, timestamps_to_ms=timestamps_to_ms,
+ field=field)
+ result = arr.to_pandas()
+ tm.assert_series_equal(pd.Series(result), pd.Series(values))
+
def test_float_no_nulls(self):
data = {}
fields = []
@@ -235,7 +244,8 @@ class TestPandasConversion(unittest.TestCase):
})
field = A.Field.from_py('datetime64', A.timestamp('ms'))
schema = A.Schema.from_fields([field])
- self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema)
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True,
+ expected_schema=schema)
df = pd.DataFrame({
'datetime64': np.array([
@@ -246,7 +256,8 @@ class TestPandasConversion(unittest.TestCase):
})
field = A.Field.from_py('datetime64', A.timestamp('ns'))
schema = A.Schema.from_fields([field])
- self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema)
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False,
+ expected_schema=schema)
def test_timestamps_notimezone_nulls(self):
df = pd.DataFrame({
@@ -258,7 +269,8 @@ class TestPandasConversion(unittest.TestCase):
})
field = A.Field.from_py('datetime64', A.timestamp('ms'))
schema = A.Schema.from_fields([field])
- self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema)
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True,
+ expected_schema=schema)
df = pd.DataFrame({
'datetime64': np.array([
@@ -269,7 +281,8 @@ class TestPandasConversion(unittest.TestCase):
})
field = A.Field.from_py('datetime64', A.timestamp('ns'))
schema = A.Schema.from_fields([field])
- self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema)
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False,
+ expected_schema=schema)
def test_date(self):
df = pd.DataFrame({
@@ -317,13 +330,13 @@ class TestPandasConversion(unittest.TestCase):
np.array(['2007-07-13T01:23:34.123456789',
None,
'2010-08-13T05:46:57.437699912'],
- dtype='datetime64[ns]'),
+ dtype='datetime64[ns]'),
None,
None,
np.array(['2007-07-13T02',
None,
'2010-08-13T05:46:57.437699912'],
- dtype='datetime64[ns]'),
+ dtype='datetime64[ns]'),
]
df = pd.DataFrame(arrays)
@@ -331,16 +344,34 @@ class TestPandasConversion(unittest.TestCase):
self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
table = A.Table.from_pandas(df, schema=schema)
assert table.schema.equals(schema)
- df_new = table.to_pandas(nthreads=1)
+
+ # smoke check: the conversion only has to complete without raising
+ table.to_pandas(nthreads=1)
def test_threaded_conversion(self):
df = _alltypes_example()
self._check_pandas_roundtrip(df, nthreads=2,
timestamps_to_ms=False)
- # def test_category(self):
- # repeats = 1000
- # values = [b'foo', None, u'bar', 'qux', np.nan]
- # df = pd.DataFrame({'strings': values * repeats})
- # df['strings'] = df['strings'].astype('category')
- # self._check_pandas_roundtrip(df)
+ def test_category(self):
+ repeats = 5
+ v1 = ['foo', None, 'bar', 'qux', np.nan]
+ v2 = [4, 5, 6, 7, 8]
+ v3 = [b'foo', None, b'bar', b'qux', np.nan]
+ df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
+ 'cat_ints': pd.Categorical(v2 * repeats),
+ 'cat_binary': pd.Categorical(v3 * repeats),
+ 'ints': v2 * repeats,
+ 'ints2': v2 * repeats,
+ 'strings': v1 * repeats,
+ 'strings2': v1 * repeats,
+ 'strings3': v3 * repeats})
+ self._check_pandas_roundtrip(df)
+
+ arrays = [
+ pd.Categorical(v1 * repeats),
+ pd.Categorical(v2 * repeats),
+ pd.Categorical(v3 * repeats)
+ ]
+ for values in arrays:
+ self._check_array_roundtrip(values)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/setup.py
----------------------------------------------------------------------
diff --git a/python/setup.py b/python/setup.py
index 72ff584..de59a92 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -128,6 +128,7 @@ class build_ext(_build_ext):
cmake_options = [
'-DPYTHON_EXECUTABLE=%s' % sys.executable,
+ '-DPYARROW_BUILD_TESTS=off',
static_lib_option,
build_tests_option,
]
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/src/pyarrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/CMakeLists.txt b/python/src/pyarrow/CMakeLists.txt
index e20c323..9e69718 100644
--- a/python/src/pyarrow/CMakeLists.txt
+++ b/python/src/pyarrow/CMakeLists.txt
@@ -18,3 +18,5 @@
#######################################
# Unit tests
#######################################
+
+ADD_PYARROW_TEST(adapters/pandas-test)
http://git-wip-us.apache.org/repos/asf/arrow/blob/9b1b3979/python/src/pyarrow/adapters/pandas-test.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas-test.cc b/python/src/pyarrow/adapters/pandas-test.cc
new file mode 100644
index 0000000..e286ccc
--- /dev/null
+++ b/python/src/pyarrow/adapters/pandas-test.cc
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/schema.h"
+#include "arrow/table.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+#include "pyarrow/adapters/pandas.h"
+
+using namespace arrow;
+
+namespace pyarrow {
+
+TEST(PandasConversionTest, TestObjectBlockWriteFails) {
+ StringBuilder builder;
+ const char value[] = {'\xf1', '\0'};
+
+ for (int i = 0; i < 1000; ++i) {
+ builder.Append(value, strlen(value));
+ }
+
+ std::shared_ptr<Array> arr;
+ ASSERT_OK(builder.Finish(&arr));
+
+ auto f1 = field("f1", utf8());
+ auto f2 = field("f2", utf8());
+ auto f3 = field("f3", utf8());
+ std::vector<std::shared_ptr<Field>> fields = {f1, f2, f3};
+ std::vector<std::shared_ptr<Column>> cols = {std::make_shared<Column>(f1, arr),
+ std::make_shared<Column>(f2, arr), std::make_shared<Column>(f3, arr)};
+
+ auto schema = std::make_shared<Schema>(fields);
+ auto table = std::make_shared<Table>("", schema, cols);
+
+ PyObject* out;
+ Py_BEGIN_ALLOW_THREADS;
+ ASSERT_RAISES(UnknownError, ConvertTableToPandas(table, 2, &out));
+ Py_END_ALLOW_THREADS;
+}
+
+} // namespace pyarrow