You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/08/06 18:41:53 UTC
[arrow] branch master updated: ARROW-6084: [Python] Support LargeList
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2774cfb ARROW-6084: [Python] Support LargeList
2774cfb is described below
commit 2774cfb2a2a88af8a72e02010cab7db36b451844
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Aug 6 13:41:41 2019 -0500
ARROW-6084: [Python] Support LargeList
Closes #4979 from pitrou/ARROW-6084-py-large-list and squashes the following commits:
4266ea2c6 <Antoine Pitrou> ARROW-6084: Support LargeList
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/arrow/python/python_to_arrow.cc | 27 +++++++----
docs/source/python/api/arrays.rst | 2 +
docs/source/python/api/datatypes.rst | 2 +
python/pyarrow/__init__.py | 10 ++--
python/pyarrow/array.pxi | 47 +++++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 18 +++++++
python/pyarrow/lib.pxd | 20 ++++++++
python/pyarrow/lib.pyx | 1 +
python/pyarrow/public-api.pxi | 2 +
python/pyarrow/scalar.pxi | 52 +++++++++++++++++++++
python/pyarrow/tests/strategies.py | 12 +++--
python/pyarrow/tests/test_array.py | 70 +++++++++++++++++++++++-----
python/pyarrow/tests/test_compute.py | 1 +
python/pyarrow/tests/test_convert_builtin.py | 12 +++++
python/pyarrow/tests/test_scalars.py | 23 +++++++++
python/pyarrow/tests/test_types.py | 21 ++++++++-
python/pyarrow/types.pxi | 55 ++++++++++++++++++++++
python/pyarrow/types.py | 10 +++-
18 files changed, 356 insertions(+), 29 deletions(-)
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 424e309..a990aec 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -582,19 +582,22 @@ class StringConverter
// ----------------------------------------------------------------------
// Convert lists (NumPy arrays containing lists or ndarrays as values)
-class ListConverter : public TypedConverter<ListType, ListConverter> {
+template <typename TypeClass>
+class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
public:
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+
explicit ListConverter(bool from_pandas, bool strict_conversions)
: from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}
Status Init(ArrayBuilder* builder) {
- builder_ = builder;
- typed_builder_ = checked_cast<ListBuilder*>(builder);
+ this->builder_ = builder;
+ this->typed_builder_ = checked_cast<BuilderType*>(builder);
- value_type_ = checked_cast<const ListType&>(*builder->type()).value_type();
+ value_type_ = checked_cast<const TypeClass&>(*builder->type()).value_type();
RETURN_NOT_OK(
GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_));
- return value_converter_->Init(typed_builder_->value_builder());
+ return value_converter_->Init(this->typed_builder_->value_builder());
}
template <int NUMPY_TYPE, typename Type>
@@ -602,7 +605,7 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
Status AppendNdarrayItem(PyObject* arr);
Status AppendItem(PyObject* obj) {
- RETURN_NOT_OK(typed_builder_->Append());
+ RETURN_NOT_OK(this->typed_builder_->Append());
if (PyArray_Check(obj)) {
return AppendNdarrayItem(obj);
}
@@ -625,8 +628,9 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
bool strict_conversions_;
};
+template <typename TypeClass>
template <int NUMPY_TYPE, typename Type>
-Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
+Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
using traits = internal::npy_traits<NUMPY_TYPE>;
using T = typename traits::value_type;
using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
@@ -673,7 +677,8 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
return value_converter_->AppendMultiple(obj, value_length); \
}
-Status ListConverter::AppendNdarrayItem(PyObject* obj) {
+template <typename TypeClass>
+Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);
if (PyArray_NDIM(arr) != 1) {
@@ -914,7 +919,11 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
}
case Type::LIST:
*out = std::unique_ptr<SeqConverter>(
- new ListConverter(from_pandas, strict_conversions));
+ new ListConverter<ListType>(from_pandas, strict_conversions));
+ break;
+ case Type::LARGE_LIST:
+ *out = std::unique_ptr<SeqConverter>(
+ new ListConverter<LargeListType>(from_pandas, strict_conversions));
break;
case Type::STRUCT:
*out = std::unique_ptr<SeqConverter>(
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index e10b5af..ca7d1e7 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -67,6 +67,7 @@ may expose data type-specific methods or properties.
Decimal128Array
DictionaryArray
ListArray
+ LargeListArray
StructArray
UnionArray
@@ -109,5 +110,6 @@ any of those classes directly.
DecimalValue
DictionaryValue
ListValue
+ LargeListValue
StructValue
UnionValue
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 327bcf6..6502ae2 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -55,6 +55,7 @@ These should be used to create Arrow data types and schemas.
large_utf8
decimal128
list_
+ large_list
struct
dictionary
field
@@ -117,6 +118,7 @@ represents a given data type (such as ``int32``) or general category
is_float64
is_decimal
is_list
+ is_large_list
is_struct
is_union
is_nested
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 51afa0f..4e7707a 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -55,9 +55,10 @@ from pyarrow.lib import (null, bool_,
binary, string, utf8,
large_binary, large_string, large_utf8,
decimal128,
- list_, struct, union, dictionary, field,
+ list_, large_list, struct, union, dictionary, field,
type_for_alias,
- DataType, DictionaryType, ListType, StructType,
+ DataType, DictionaryType, StructType,
+ ListType, LargeListType,
UnionType, TimestampType, Time32Type, Time64Type,
FixedSizeBinaryType, Decimal128Type,
BaseExtensionType, ExtensionType,
@@ -77,7 +78,7 @@ from pyarrow.lib import (null, bool_,
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
- ListArray, UnionArray,
+ ListArray, LargeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
FixedSizeBinaryArray,
@@ -89,7 +90,8 @@ from pyarrow.lib import (null, bool_,
BooleanValue,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
- HalfFloatValue, FloatValue, DoubleValue, ListValue,
+ HalfFloatValue, FloatValue, DoubleValue,
+ ListValue, LargeListValue,
BinaryValue, StringValue,
LargeBinaryValue, LargeStringValue,
FixedSizeBinaryValue,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ecb8ff5..b93cf10 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1070,6 +1070,52 @@ cdef class ListArray(Array):
return pyarrow_wrap_array(arr.values())
+cdef class LargeListArray(Array):
+ """
+ Concrete class for Arrow arrays of a large list data type
+ (like ListArray, but 64-bit offsets).
+ """
+
+ @staticmethod
+ def from_arrays(offsets, values, MemoryPool pool=None):
+ """
+ Construct LargeListArray from arrays of int64 offsets and values
+
+ Parameters
+ ----------
+ offset : Array (int64 type)
+ values : Array (any type)
+
+ Returns
+ -------
+ list_array : LargeListArray
+ """
+ cdef:
+ Array _offsets, _values
+ shared_ptr[CArray] out
+ cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+
+ _offsets = asarray(offsets, type='int64')
+ _values = asarray(values)
+
+ with nogil:
+ check_status(CLargeListArray.FromArrays(_offsets.ap[0],
+ _values.ap[0],
+ cpool, &out))
+ return pyarrow_wrap_array(out)
+
+ def flatten(self):
+ """
+ Unnest this LargeListArray by one level
+
+ Returns
+ -------
+ result : Array
+ """
+ cdef CLargeListArray* arr = <CLargeListArray*> self.ap
+ return pyarrow_wrap_array(arr.values())
+
+
cdef class UnionArray(Array):
"""
Concrete class for Arrow arrays of a Union data type.
@@ -1511,6 +1557,7 @@ cdef dict _array_classes = {
_Type_FLOAT: FloatArray,
_Type_DOUBLE: DoubleArray,
_Type_LIST: ListArray,
+ _Type_LARGE_LIST: LargeListArray,
_Type_UNION: UnionArray,
_Type_BINARY: BinaryArray,
_Type_STRING: StringArray,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 4dc6427..ad0fa09 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -70,6 +70,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
_Type_LIST" arrow::Type::LIST"
+ _Type_LARGE_LIST" arrow::Type::LARGE_LIST"
_Type_STRUCT" arrow::Type::STRUCT"
_Type_UNION" arrow::Type::UNION"
_Type_DICTIONARY" arrow::Type::DICTIONARY"
@@ -252,6 +253,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CDataType] value_type()
shared_ptr[CField] value_field()
+ cdef cppclass CLargeListType" arrow::LargeListType"(CDataType):
+ CLargeListType(const shared_ptr[CDataType]& value_type)
+ CLargeListType(const shared_ptr[CField]& field)
+ shared_ptr[CDataType] value_type()
+ shared_ptr[CField] value_field()
+
cdef cppclass CStringType" arrow::StringType"(CDataType):
pass
@@ -419,6 +426,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CArray] values()
shared_ptr[CDataType] value_type()
+ cdef cppclass CLargeListArray" arrow::LargeListArray"(CArray):
+ @staticmethod
+ CStatus FromArrays(const CArray& offsets, const CArray& values,
+ CMemoryPool* pool, shared_ptr[CArray]* out)
+
+ const int64_t* raw_value_offsets()
+ int64_t value_offset(int i)
+ int64_t value_length(int i)
+ shared_ptr[CArray] values()
+ shared_ptr[CDataType] value_type()
+
cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
@staticmethod
CStatus MakeSparse(const CArray& type_ids,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 0931463..b59f3f8 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -64,6 +64,11 @@ cdef class ListType(DataType):
const CListType* list_type
+cdef class LargeListType(DataType):
+ cdef:
+ const CLargeListType* list_type
+
+
cdef class StructType(DataType):
cdef:
const CStructType* struct_type
@@ -184,6 +189,17 @@ cdef class ListValue(ArrayValue):
cdef int64_t length(self)
+cdef class LargeListValue(ArrayValue):
+ cdef readonly:
+ DataType value_type
+
+ cdef:
+ CLargeListArray* ap
+
+ cdef getitem(self, int64_t i)
+ cdef int64_t length(self)
+
+
cdef class StructValue(ArrayValue):
cdef:
CStructArray* ap
@@ -336,6 +352,10 @@ cdef class ListArray(Array):
pass
+cdef class LargeListArray(Array):
+ pass
+
+
cdef class UnionArray(Array):
pass
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 0b33f39..8baaab3 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -91,6 +91,7 @@ Type_LARGE_BINARY = _Type_LARGE_BINARY
Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
Type_LIST = _Type_LIST
+Type_LARGE_LIST = _Type_LARGE_LIST
Type_STRUCT = _Type_STRUCT
Type_UNION = _Type_UNION
Type_DICTIONARY = _Type_DICTIONARY
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index f6ef2c9..bb5bdca 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -79,6 +79,8 @@ cdef api object pyarrow_wrap_data_type(
out = DictionaryType.__new__(DictionaryType)
elif type.get().id() == _Type_LIST:
out = ListType.__new__(ListType)
+ elif type.get().id() == _Type_LARGE_LIST:
+ out = LargeListType.__new__(LargeListType)
elif type.get().id() == _Type_STRUCT:
out = StructType.__new__(StructType)
elif type.get().id() == _Type_UNION:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 0ead3e5..aa100ca 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -586,6 +586,57 @@ cdef class ListValue(ArrayValue):
return result
+cdef class LargeListValue(ArrayValue):
+ """
+ Concrete class for large list array elements.
+ """
+
+ def __len__(self):
+ """
+ Return the number of values.
+ """
+ return self.length()
+
+ def __getitem__(self, i):
+ """
+ Return the value at the given index.
+ """
+ return self.getitem(_normalize_index(i, self.length()))
+
+ def __iter__(self):
+ """
+ Iterate over this element's values.
+ """
+ for i in range(len(self)):
+ yield self.getitem(i)
+ raise StopIteration
+
+ cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+ self.sp_array = sp_array
+ self.ap = <CLargeListArray*> sp_array.get()
+ self.value_type = pyarrow_wrap_data_type(self.ap.value_type())
+
+ cdef getitem(self, int64_t i):
+ cdef int64_t j = self.ap.value_offset(self.index) + i
+ return box_scalar(self.value_type, self.ap.values(), j)
+
+ cdef int64_t length(self):
+ return self.ap.value_length(self.index)
+
+ def as_py(self):
+ """
+ Return this value as a Python list.
+ """
+ cdef:
+ int64_t j
+ list result = []
+
+ for j in range(len(self)):
+ result.append(self.getitem(j).as_py())
+
+ return result
+
+
cdef class UnionValue(ArrayValue):
"""
Concrete class for union array elements.
@@ -729,6 +780,7 @@ cdef dict _array_value_classes = {
_Type_FLOAT: FloatValue,
_Type_DOUBLE: DoubleValue,
_Type_LIST: ListValue,
+ _Type_LARGE_LIST: LargeListValue,
_Type_UNION: UnionValue,
_Type_BINARY: BinaryValue,
_Type_STRING: StringValue,
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 498d738..4bbe86d 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -104,7 +104,10 @@ def fields(type_strategy=primitive_types):
def list_types(item_strategy=primitive_types):
- return st.builds(pa.list_, item_strategy)
+ return (
+ st.builds(pa.list_, item_strategy) |
+ st.builds(pa.large_list, item_strategy)
+ )
def struct_types(item_strategy=primitive_types):
@@ -159,11 +162,14 @@ def arrays(draw, type, size=None):
shape = (size,)
- if pa.types.is_list(type):
+ if pa.types.is_list(type) or pa.types.is_large_list(type):
offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20
offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero
values = draw(arrays(type.value_type, size=int(offsets.sum())))
- return pa.ListArray.from_arrays(offsets, values)
+ array_type = (
+ pa.LargeListArray if pa.types.is_large_list(type)
+ else pa.ListArray)
+ return array_type.from_arrays(offsets, values)
if pa.types.is_struct(type):
h.assume(len(type) > 0)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index c591544..03db7e9 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -346,9 +346,11 @@ def test_string_binary_from_buffers():
assert copied.null_count == 0
-def test_list_from_buffers():
- ty = pa.list_(pa.int16())
+@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
+def test_list_from_buffers(list_type_factory):
+ ty = list_type_factory(pa.int16())
array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
+ assert array.type == ty
buffers = array.buffers()
@@ -486,31 +488,36 @@ def test_dictionary_from_arrays_boundscheck():
pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False)
-def test_list_from_arrays():
+@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
+ [(pa.ListArray, pa.list_),
+ (pa.LargeListArray, pa.large_list)])
+def test_list_from_arrays(list_array_type, list_type_factory):
offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
offsets = pa.array(offsets_arr, type='int32')
pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
values = pa.array(pyvalues, type='binary')
- result = pa.ListArray.from_arrays(offsets, values)
- expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])
+ result = list_array_type.from_arrays(offsets, values)
+ expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]],
+ type=list_type_factory(pa.binary()))
assert result.equals(expected)
# With nulls
offsets = [0, None, 2, 6]
+ values = [b'a', b'b', b'c', b'd', b'e', b'f']
- values = ['a', 'b', 'c', 'd', 'e', 'f']
-
- result = pa.ListArray.from_arrays(offsets, values)
- expected = pa.array([values[:2], None, values[2:]])
+ result = list_array_type.from_arrays(offsets, values)
+ expected = pa.array([values[:2], None, values[2:]],
+ type=list_type_factory(pa.binary()))
assert result.equals(expected)
# Another edge case
offsets2 = [0, 2, None, 6]
- result = pa.ListArray.from_arrays(offsets2, values)
- expected = pa.array([values[:2], values[2:], None])
+ result = list_array_type.from_arrays(offsets2, values)
+ expected = pa.array([values[:2], values[2:], None],
+ type=list_type_factory(pa.binary()))
assert result.equals(expected)
@@ -767,6 +774,7 @@ def test_cast_from_null():
pa.binary(),
pa.binary(10),
pa.list_(pa.int16()),
+ pa.large_list(pa.uint8()),
pa.decimal128(19, 4),
pa.timestamp('us'),
pa.timestamp('us', tz='UTC'),
@@ -925,6 +933,7 @@ pickle_test_parametrize = pytest.mark.parametrize(
(['a', None, 'b'], pa.string()),
([], None),
([[1, 2], [3]], pa.list_(pa.int64())),
+ ([[4, 5], [6]], pa.large_list(pa.int16())),
([['a'], None, ['b', 'c']], pa.list_(pa.string())),
([(1, 'a'), (2, 'c'), None],
pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
@@ -1307,6 +1316,45 @@ def test_list_array_flatten():
assert arr2.flatten().flatten().equals(arr0)
+def test_large_list_array_flatten():
+ typ2 = pa.large_list(
+ pa.large_list(
+ pa.int16()
+ )
+ )
+ arr2 = pa.array([
+ None,
+ [
+ [1, None, 2],
+ None,
+ [3, 4]
+ ],
+ [],
+ [
+ [],
+ [5, 6],
+ None
+ ],
+ [
+ [7, 8]
+ ]
+ ], type=typ2)
+
+ typ1 = pa.large_list(pa.int16())
+ assert typ1 == typ2.value_type
+ arr1 = pa.array([
+ [1, None, 2],
+ None,
+ [3, 4],
+ [],
+ [5, 6],
+ None,
+ [7, 8]
+ ], type=typ1)
+
+ assert arr2.flatten().equals(arr1)
+
+
def test_struct_array_flatten():
ty = pa.struct([pa.field('x', pa.int16()),
pa.field('y', pa.float32())])
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 37da62c..2520bbd 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -53,6 +53,7 @@ def test_sum(arrow_type):
('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
(pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
(pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
+ (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
(pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
{'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
])
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index f39706e..dec5993 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -206,10 +206,22 @@ def test_nested_lists(seq):
@parametrize_with_iterable_types
+def test_nested_large_lists(seq):
+ data = [[], [1, 2], None]
+ arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
+ assert len(arr) == 3
+ assert arr.null_count == 1
+ assert arr.type == pa.large_list(pa.int16())
+ assert arr.to_pylist() == data
+
+
+@parametrize_with_iterable_types
def test_list_with_non_list(seq):
# List types don't accept non-sequences
with pytest.raises(TypeError):
pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
+ with pytest.raises(TypeError):
+ pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
@parametrize_with_iterable_types
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index ca7a10e..b319c6f 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -195,6 +195,29 @@ class TestScalars(unittest.TestCase):
v = arr[3]
assert len(v) == 0
+ def test_large_list(self):
+ arr = pa.array([[123, None], None, [456], []],
+ type=pa.large_list(pa.int16()))
+
+ v = arr[0]
+ assert len(v) == 2
+ assert isinstance(v, pa.LargeListValue)
+ assert repr(v) == "[123, None]"
+ assert v.as_py() == [123, None]
+ assert v[0].as_py() == 123
+ assert v[1] is pa.NA
+ assert v[-1] == v[1]
+ assert v[-2] == v[0]
+ with pytest.raises(IndexError):
+ v[-3]
+ with pytest.raises(IndexError):
+ v[2]
+
+ assert arr[1] is pa.NA
+
+ v = arr[3]
+ assert len(v) == 0
+
@pytest.mark.pandas
def test_timestamp(self):
import pandas as pd
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index de532e6..fb1437d 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -52,6 +52,7 @@ def get_many_types():
pa.large_string(),
pa.large_binary(),
pa.list_(pa.int32()),
+ pa.large_list(pa.uint16()),
pa.struct([pa.field('a', pa.int32()),
pa.field('b', pa.int8()),
pa.field('c', pa.string())]),
@@ -110,7 +111,14 @@ def test_is_decimal():
def test_is_list():
- assert types.is_list(pa.list_(pa.int32()))
+ a = pa.list_(pa.int32())
+ b = pa.large_list(pa.int32())
+
+ assert types.is_list(a)
+ assert not types.is_large_list(a)
+ assert types.is_large_list(b)
+ assert not types.is_list(b)
+
assert not types.is_list(pa.int32())
@@ -129,6 +137,7 @@ def test_is_nested_or_struct():
assert types.is_nested(struct_ex)
assert types.is_nested(pa.list_(pa.int32()))
+ assert types.is_nested(pa.large_list(pa.int32()))
assert not types.is_nested(pa.int32())
@@ -237,12 +246,22 @@ def test_time64_units():
def test_list_type():
ty = pa.list_(pa.int64())
+ assert isinstance(ty, pa.ListType)
assert ty.value_type == pa.int64()
with pytest.raises(TypeError):
pa.list_(None)
+def test_large_list_type():
+ ty = pa.large_list(pa.utf8())
+ assert isinstance(ty, pa.LargeListType)
+ assert ty.value_type == pa.utf8()
+
+ with pytest.raises(TypeError):
+ pa.large_list(None)
+
+
def test_struct_type():
fields = [
# Duplicate field name on purpose
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0db15d5..03a4b3d 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -259,6 +259,27 @@ cdef class ListType(DataType):
return pyarrow_wrap_data_type(self.list_type.value_type())
+cdef class LargeListType(DataType):
+ """
+ Concrete class for large list data types
+ (like ListType, but with 64-bit offsets).
+ """
+
+ cdef void init(self, const shared_ptr[CDataType]& type) except *:
+ DataType.init(self, type)
+ self.list_type = <const CLargeListType*> type.get()
+
+ def __reduce__(self):
+ return large_list, (self.value_type,)
+
+ @property
+ def value_type(self):
+ """
+ The data type of large list values.
+ """
+ return pyarrow_wrap_data_type(self.list_type.value_type())
+
+
cdef class StructType(DataType):
"""
Concrete class for struct data types.
@@ -1589,6 +1610,40 @@ cpdef ListType list_(value_type):
return out
+cpdef LargeListType large_list(value_type):
+ """
+ Create LargeListType instance from child data type or field
+
+ This data type may not be supported by all Arrow implementations.
+ Unless you need to represent data larger than 2**31 elements, you should
+ prefer list_().
+
+ Parameters
+ ----------
+ value_type : DataType or Field
+
+ Returns
+ -------
+ list_type : DataType
+ """
+ cdef:
+ DataType data_type
+ Field _field
+ shared_ptr[CDataType] list_type
+ LargeListType out = LargeListType.__new__(LargeListType)
+
+ if isinstance(value_type, DataType):
+ _field = field('item', value_type)
+ elif isinstance(value_type, Field):
+ _field = value_type
+ else:
+ raise TypeError('List requires DataType or Field')
+
+ list_type.reset(new CLargeListType(_field.sp_field))
+ out.init(list_type)
+ return out
+
+
cpdef DictionaryType dictionary(index_type, value_type, bint ordered=False):
"""
Dictionary (categorical, or simply encoded) type
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index dc314e8..d4dffb7 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -35,7 +35,8 @@ _FLOATING_TYPES = {lib.Type_HALF_FLOAT, lib.Type_FLOAT, lib.Type_DOUBLE}
_DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64}
_TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64}
_TEMPORAL_TYPES = {lib.Type_TIMESTAMP} | _TIME_TYPES | _DATE_TYPES
-_NESTED_TYPES = {lib.Type_LIST, lib.Type_STRUCT, lib.Type_UNION, lib.Type_MAP}
+_NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT,
+ lib.Type_UNION, lib.Type_MAP}
def is_null(t):
@@ -164,6 +165,13 @@ def is_list(t):
return t.id == lib.Type_LIST
+def is_large_list(t):
+ """
+ Return True if value is an instance of a large list type
+ """
+ return t.id == lib.Type_LARGE_LIST
+
+
def is_struct(t):
"""
Return True if value is an instance of a struct type