You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/08/06 18:41:53 UTC

[arrow] branch master updated: ARROW-6084: [Python] Support LargeList

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2774cfb  ARROW-6084: [Python] Support LargeList
2774cfb is described below

commit 2774cfb2a2a88af8a72e02010cab7db36b451844
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Aug 6 13:41:41 2019 -0500

    ARROW-6084: [Python] Support LargeList
    
    Closes #4979 from pitrou/ARROW-6084-py-large-list and squashes the following commits:
    
    4266ea2c6 <Antoine Pitrou> ARROW-6084:  Support LargeList
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/python/python_to_arrow.cc      | 27 +++++++----
 docs/source/python/api/arrays.rst            |  2 +
 docs/source/python/api/datatypes.rst         |  2 +
 python/pyarrow/__init__.py                   | 10 ++--
 python/pyarrow/array.pxi                     | 47 +++++++++++++++++++
 python/pyarrow/includes/libarrow.pxd         | 18 +++++++
 python/pyarrow/lib.pxd                       | 20 ++++++++
 python/pyarrow/lib.pyx                       |  1 +
 python/pyarrow/public-api.pxi                |  2 +
 python/pyarrow/scalar.pxi                    | 52 +++++++++++++++++++++
 python/pyarrow/tests/strategies.py           | 12 +++--
 python/pyarrow/tests/test_array.py           | 70 +++++++++++++++++++++++-----
 python/pyarrow/tests/test_compute.py         |  1 +
 python/pyarrow/tests/test_convert_builtin.py | 12 +++++
 python/pyarrow/tests/test_scalars.py         | 23 +++++++++
 python/pyarrow/tests/test_types.py           | 21 ++++++++-
 python/pyarrow/types.pxi                     | 55 ++++++++++++++++++++++
 python/pyarrow/types.py                      | 10 +++-
 18 files changed, 356 insertions(+), 29 deletions(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 424e309..a990aec 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -582,19 +582,22 @@ class StringConverter
 // ----------------------------------------------------------------------
 // Convert lists (NumPy arrays containing lists or ndarrays as values)
 
-class ListConverter : public TypedConverter<ListType, ListConverter> {
+template <typename TypeClass>
+class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
  public:
+  using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+
   explicit ListConverter(bool from_pandas, bool strict_conversions)
       : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}
 
   Status Init(ArrayBuilder* builder) {
-    builder_ = builder;
-    typed_builder_ = checked_cast<ListBuilder*>(builder);
+    this->builder_ = builder;
+    this->typed_builder_ = checked_cast<BuilderType*>(builder);
 
-    value_type_ = checked_cast<const ListType&>(*builder->type()).value_type();
+    value_type_ = checked_cast<const TypeClass&>(*builder->type()).value_type();
     RETURN_NOT_OK(
         GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_));
-    return value_converter_->Init(typed_builder_->value_builder());
+    return value_converter_->Init(this->typed_builder_->value_builder());
   }
 
   template <int NUMPY_TYPE, typename Type>
@@ -602,7 +605,7 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
   Status AppendNdarrayItem(PyObject* arr);
 
   Status AppendItem(PyObject* obj) {
-    RETURN_NOT_OK(typed_builder_->Append());
+    RETURN_NOT_OK(this->typed_builder_->Append());
     if (PyArray_Check(obj)) {
       return AppendNdarrayItem(obj);
     }
@@ -625,8 +628,9 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
   bool strict_conversions_;
 };
 
+template <typename TypeClass>
 template <int NUMPY_TYPE, typename Type>
-Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
+Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
   using traits = internal::npy_traits<NUMPY_TYPE>;
   using T = typename traits::value_type;
   using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
@@ -673,7 +677,8 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
     return value_converter_->AppendMultiple(obj, value_length); \
   }
 
-Status ListConverter::AppendNdarrayItem(PyObject* obj) {
+template <typename TypeClass>
+Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
   PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);
 
   if (PyArray_NDIM(arr) != 1) {
@@ -914,7 +919,11 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
     }
     case Type::LIST:
       *out = std::unique_ptr<SeqConverter>(
-          new ListConverter(from_pandas, strict_conversions));
+          new ListConverter<ListType>(from_pandas, strict_conversions));
+      break;
+    case Type::LARGE_LIST:
+      *out = std::unique_ptr<SeqConverter>(
+          new ListConverter<LargeListType>(from_pandas, strict_conversions));
       break;
     case Type::STRUCT:
       *out = std::unique_ptr<SeqConverter>(
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index e10b5af..ca7d1e7 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -67,6 +67,7 @@ may expose data type-specific methods or properties.
    Decimal128Array
    DictionaryArray
    ListArray
+   LargeListArray
    StructArray
    UnionArray
 
@@ -109,5 +110,6 @@ any of those classes directly.
    DecimalValue
    DictionaryValue
    ListValue
+   LargeListValue
    StructValue
    UnionValue
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 327bcf6..6502ae2 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -55,6 +55,7 @@ These should be used to create Arrow data types and schemas.
    large_utf8
    decimal128
    list_
+   large_list
    struct
    dictionary
    field
@@ -117,6 +118,7 @@ represents a given data type (such as ``int32``) or general category
    is_float64
    is_decimal
    is_list
+   is_large_list
    is_struct
    is_union
    is_nested
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 51afa0f..4e7707a 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -55,9 +55,10 @@ from pyarrow.lib import (null, bool_,
                          binary, string, utf8,
                          large_binary, large_string, large_utf8,
                          decimal128,
-                         list_, struct, union, dictionary, field,
+                         list_, large_list, struct, union, dictionary, field,
                          type_for_alias,
-                         DataType, DictionaryType, ListType, StructType,
+                         DataType, DictionaryType, StructType,
+                         ListType, LargeListType,
                          UnionType, TimestampType, Time32Type, Time64Type,
                          FixedSizeBinaryType, Decimal128Type,
                          BaseExtensionType, ExtensionType,
@@ -77,7 +78,7 @@ from pyarrow.lib import (null, bool_,
                          Int16Array, UInt16Array,
                          Int32Array, UInt32Array,
                          Int64Array, UInt64Array,
-                         ListArray, UnionArray,
+                         ListArray, LargeListArray, UnionArray,
                          BinaryArray, StringArray,
                          LargeBinaryArray, LargeStringArray,
                          FixedSizeBinaryArray,
@@ -89,7 +90,8 @@ from pyarrow.lib import (null, bool_,
                          BooleanValue,
                          Int8Value, Int16Value, Int32Value, Int64Value,
                          UInt8Value, UInt16Value, UInt32Value, UInt64Value,
-                         HalfFloatValue, FloatValue, DoubleValue, ListValue,
+                         HalfFloatValue, FloatValue, DoubleValue,
+                         ListValue, LargeListValue,
                          BinaryValue, StringValue,
                          LargeBinaryValue, LargeStringValue,
                          FixedSizeBinaryValue,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ecb8ff5..b93cf10 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1070,6 +1070,52 @@ cdef class ListArray(Array):
         return pyarrow_wrap_array(arr.values())
 
 
+cdef class LargeListArray(Array):
+    """
+    Concrete class for Arrow arrays of a large list data type
+    (like ListArray, but 64-bit offsets).
+    """
+
+    @staticmethod
+    def from_arrays(offsets, values, MemoryPool pool=None):
+        """
+        Construct LargeListArray from arrays of int64 offsets and values
+
+        Parameters
+        ----------
+        offsets : Array (int64 type)
+        values : Array (any type)
+
+        Returns
+        -------
+        list_array : LargeListArray
+        """
+        cdef:
+            Array _offsets, _values
+            shared_ptr[CArray] out
+        cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+
+        _offsets = asarray(offsets, type='int64')
+        _values = asarray(values)
+
+        with nogil:
+            check_status(CLargeListArray.FromArrays(_offsets.ap[0],
+                                                    _values.ap[0],
+                                                    cpool, &out))
+        return pyarrow_wrap_array(out)
+
+    def flatten(self):
+        """
+        Unnest this LargeListArray by one level
+
+        Returns
+        -------
+        result : Array
+        """
+        cdef CLargeListArray* arr = <CLargeListArray*> self.ap
+        return pyarrow_wrap_array(arr.values())
+
+
 cdef class UnionArray(Array):
     """
     Concrete class for Arrow arrays of a Union data type.
@@ -1511,6 +1557,7 @@ cdef dict _array_classes = {
     _Type_FLOAT: FloatArray,
     _Type_DOUBLE: DoubleArray,
     _Type_LIST: ListArray,
+    _Type_LARGE_LIST: LargeListArray,
     _Type_UNION: UnionArray,
     _Type_BINARY: BinaryArray,
     _Type_STRING: StringArray,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 4dc6427..ad0fa09 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -70,6 +70,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"
 
         _Type_LIST" arrow::Type::LIST"
+        _Type_LARGE_LIST" arrow::Type::LARGE_LIST"
         _Type_STRUCT" arrow::Type::STRUCT"
         _Type_UNION" arrow::Type::UNION"
         _Type_DICTIONARY" arrow::Type::DICTIONARY"
@@ -252,6 +253,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CDataType] value_type()
         shared_ptr[CField] value_field()
 
+    cdef cppclass CLargeListType" arrow::LargeListType"(CDataType):
+        CLargeListType(const shared_ptr[CDataType]& value_type)
+        CLargeListType(const shared_ptr[CField]& field)
+        shared_ptr[CDataType] value_type()
+        shared_ptr[CField] value_field()
+
     cdef cppclass CStringType" arrow::StringType"(CDataType):
         pass
 
@@ -419,6 +426,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] values()
         shared_ptr[CDataType] value_type()
 
+    cdef cppclass CLargeListArray" arrow::LargeListArray"(CArray):
+        @staticmethod
+        CStatus FromArrays(const CArray& offsets, const CArray& values,
+                           CMemoryPool* pool, shared_ptr[CArray]* out)
+
+        const int64_t* raw_value_offsets()
+        int64_t value_offset(int i)
+        int64_t value_length(int i)
+        shared_ptr[CArray] values()
+        shared_ptr[CDataType] value_type()
+
     cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
         @staticmethod
         CStatus MakeSparse(const CArray& type_ids,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 0931463..b59f3f8 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -64,6 +64,11 @@ cdef class ListType(DataType):
         const CListType* list_type
 
 
+cdef class LargeListType(DataType):
+    cdef:
+        const CLargeListType* list_type
+
+
 cdef class StructType(DataType):
     cdef:
         const CStructType* struct_type
@@ -184,6 +189,17 @@ cdef class ListValue(ArrayValue):
     cdef int64_t length(self)
 
 
+cdef class LargeListValue(ArrayValue):
+    cdef readonly:
+        DataType value_type
+
+    cdef:
+        CLargeListArray* ap
+
+    cdef getitem(self, int64_t i)
+    cdef int64_t length(self)
+
+
 cdef class StructValue(ArrayValue):
     cdef:
         CStructArray* ap
@@ -336,6 +352,10 @@ cdef class ListArray(Array):
     pass
 
 
+cdef class LargeListArray(Array):
+    pass
+
+
 cdef class UnionArray(Array):
     pass
 
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 0b33f39..8baaab3 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -91,6 +91,7 @@ Type_LARGE_BINARY = _Type_LARGE_BINARY
 Type_LARGE_STRING = _Type_LARGE_STRING
 Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
 Type_LIST = _Type_LIST
+Type_LARGE_LIST = _Type_LARGE_LIST
 Type_STRUCT = _Type_STRUCT
 Type_UNION = _Type_UNION
 Type_DICTIONARY = _Type_DICTIONARY
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index f6ef2c9..bb5bdca 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -79,6 +79,8 @@ cdef api object pyarrow_wrap_data_type(
         out = DictionaryType.__new__(DictionaryType)
     elif type.get().id() == _Type_LIST:
         out = ListType.__new__(ListType)
+    elif type.get().id() == _Type_LARGE_LIST:
+        out = LargeListType.__new__(LargeListType)
     elif type.get().id() == _Type_STRUCT:
         out = StructType.__new__(StructType)
     elif type.get().id() == _Type_UNION:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 0ead3e5..aa100ca 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -586,6 +586,57 @@ cdef class ListValue(ArrayValue):
         return result
 
 
+cdef class LargeListValue(ArrayValue):
+    """
+    Concrete class for large list array elements.
+    """
+
+    def __len__(self):
+        """
+        Return the number of values.
+        """
+        return self.length()
+
+    def __getitem__(self, i):
+        """
+        Return the value at the given index.
+        """
+        return self.getitem(_normalize_index(i, self.length()))
+
+    def __iter__(self):
+        """
+        Iterate over this element's values.
+        """
+        for i in range(len(self)):
+            yield self.getitem(i)
+        raise StopIteration
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+        self.ap = <CLargeListArray*> sp_array.get()
+        self.value_type = pyarrow_wrap_data_type(self.ap.value_type())
+
+    cdef getitem(self, int64_t i):
+        cdef int64_t j = self.ap.value_offset(self.index) + i
+        return box_scalar(self.value_type, self.ap.values(), j)
+
+    cdef int64_t length(self):
+        return self.ap.value_length(self.index)
+
+    def as_py(self):
+        """
+        Return this value as a Python list.
+        """
+        cdef:
+            int64_t j
+            list result = []
+
+        for j in range(len(self)):
+            result.append(self.getitem(j).as_py())
+
+        return result
+
+
 cdef class UnionValue(ArrayValue):
     """
     Concrete class for union array elements.
@@ -729,6 +780,7 @@ cdef dict _array_value_classes = {
     _Type_FLOAT: FloatValue,
     _Type_DOUBLE: DoubleValue,
     _Type_LIST: ListValue,
+    _Type_LARGE_LIST: LargeListValue,
     _Type_UNION: UnionValue,
     _Type_BINARY: BinaryValue,
     _Type_STRING: StringValue,
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 498d738..4bbe86d 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -104,7 +104,10 @@ def fields(type_strategy=primitive_types):
 
 
 def list_types(item_strategy=primitive_types):
-    return st.builds(pa.list_, item_strategy)
+    return (
+        st.builds(pa.list_, item_strategy) |
+        st.builds(pa.large_list, item_strategy)
+        )
 
 
 def struct_types(item_strategy=primitive_types):
@@ -159,11 +162,14 @@ def arrays(draw, type, size=None):
 
     shape = (size,)
 
-    if pa.types.is_list(type):
+    if pa.types.is_list(type) or pa.types.is_large_list(type):
         offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20
         offsets = np.insert(offsets, 0, 0, axis=0)  # prepend with zero
         values = draw(arrays(type.value_type, size=int(offsets.sum())))
-        return pa.ListArray.from_arrays(offsets, values)
+        array_type = (
+            pa.LargeListArray if pa.types.is_large_list(type)
+            else pa.ListArray)
+        return array_type.from_arrays(offsets, values)
 
     if pa.types.is_struct(type):
         h.assume(len(type) > 0)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index c591544..03db7e9 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -346,9 +346,11 @@ def test_string_binary_from_buffers():
     assert copied.null_count == 0
 
 
-def test_list_from_buffers():
-    ty = pa.list_(pa.int16())
+@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list])
+def test_list_from_buffers(list_type_factory):
+    ty = list_type_factory(pa.int16())
     array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
+    assert array.type == ty
 
     buffers = array.buffers()
 
@@ -486,31 +488,36 @@ def test_dictionary_from_arrays_boundscheck():
     pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False)
 
 
-def test_list_from_arrays():
+@pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
+                         [(pa.ListArray, pa.list_),
+                          (pa.LargeListArray, pa.large_list)])
+def test_list_from_arrays(list_array_type, list_type_factory):
     offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
     offsets = pa.array(offsets_arr, type='int32')
     pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
     values = pa.array(pyvalues, type='binary')
 
-    result = pa.ListArray.from_arrays(offsets, values)
-    expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])
+    result = list_array_type.from_arrays(offsets, values)
+    expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]],
+                        type=list_type_factory(pa.binary()))
 
     assert result.equals(expected)
 
     # With nulls
     offsets = [0, None, 2, 6]
+    values = [b'a', b'b', b'c', b'd', b'e', b'f']
 
-    values = ['a', 'b', 'c', 'd', 'e', 'f']
-
-    result = pa.ListArray.from_arrays(offsets, values)
-    expected = pa.array([values[:2], None, values[2:]])
+    result = list_array_type.from_arrays(offsets, values)
+    expected = pa.array([values[:2], None, values[2:]],
+                        type=list_type_factory(pa.binary()))
 
     assert result.equals(expected)
 
     # Another edge case
     offsets2 = [0, 2, None, 6]
-    result = pa.ListArray.from_arrays(offsets2, values)
-    expected = pa.array([values[:2], values[2:], None])
+    result = list_array_type.from_arrays(offsets2, values)
+    expected = pa.array([values[:2], values[2:], None],
+                        type=list_type_factory(pa.binary()))
     assert result.equals(expected)
 
 
@@ -767,6 +774,7 @@ def test_cast_from_null():
         pa.binary(),
         pa.binary(10),
         pa.list_(pa.int16()),
+        pa.large_list(pa.uint8()),
         pa.decimal128(19, 4),
         pa.timestamp('us'),
         pa.timestamp('us', tz='UTC'),
@@ -925,6 +933,7 @@ pickle_test_parametrize = pytest.mark.parametrize(
         (['a', None, 'b'], pa.string()),
         ([], None),
         ([[1, 2], [3]], pa.list_(pa.int64())),
+        ([[4, 5], [6]], pa.large_list(pa.int16())),
         ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
         ([(1, 'a'), (2, 'c'), None],
             pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
@@ -1307,6 +1316,45 @@ def test_list_array_flatten():
     assert arr2.flatten().flatten().equals(arr0)
 
 
+def test_large_list_array_flatten():
+    typ2 = pa.large_list(
+        pa.large_list(
+            pa.int16()
+        )
+    )
+    arr2 = pa.array([
+        None,
+        [
+            [1, None, 2],
+            None,
+            [3, 4]
+        ],
+        [],
+        [
+            [],
+            [5, 6],
+            None
+        ],
+        [
+            [7, 8]
+        ]
+    ], type=typ2)
+
+    typ1 = pa.large_list(pa.int16())
+    assert typ1 == typ2.value_type
+    arr1 = pa.array([
+        [1, None, 2],
+        None,
+        [3, 4],
+        [],
+        [5, 6],
+        None,
+        [7, 8]
+    ], type=typ1)
+
+    assert arr2.flatten().equals(arr1)
+
+
 def test_struct_array_flatten():
     ty = pa.struct([pa.field('x', pa.int16()),
                     pa.field('y', pa.float32())])
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 37da62c..2520bbd 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -53,6 +53,7 @@ def test_sum(arrow_type):
     ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
     (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
     (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
+    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
     (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
      {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
 ])
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index f39706e..dec5993 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -206,10 +206,22 @@ def test_nested_lists(seq):
 
 
 @parametrize_with_iterable_types
+def test_nested_large_lists(seq):
+    data = [[], [1, 2], None]
+    arr = pa.array(seq(data), type=pa.large_list(pa.int16()))
+    assert len(arr) == 3
+    assert arr.null_count == 1
+    assert arr.type == pa.large_list(pa.int16())
+    assert arr.to_pylist() == data
+
+
+@parametrize_with_iterable_types
 def test_list_with_non_list(seq):
     # List types don't accept non-sequences
     with pytest.raises(TypeError):
         pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
+    with pytest.raises(TypeError):
+        pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64()))
 
 
 @parametrize_with_iterable_types
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index ca7a10e..b319c6f 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -195,6 +195,29 @@ class TestScalars(unittest.TestCase):
         v = arr[3]
         assert len(v) == 0
 
+    def test_large_list(self):
+        arr = pa.array([[123, None], None, [456], []],
+                       type=pa.large_list(pa.int16()))
+
+        v = arr[0]
+        assert len(v) == 2
+        assert isinstance(v, pa.LargeListValue)
+        assert repr(v) == "[123, None]"
+        assert v.as_py() == [123, None]
+        assert v[0].as_py() == 123
+        assert v[1] is pa.NA
+        assert v[-1] == v[1]
+        assert v[-2] == v[0]
+        with pytest.raises(IndexError):
+            v[-3]
+        with pytest.raises(IndexError):
+            v[2]
+
+        assert arr[1] is pa.NA
+
+        v = arr[3]
+        assert len(v) == 0
+
     @pytest.mark.pandas
     def test_timestamp(self):
         import pandas as pd
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index de532e6..fb1437d 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -52,6 +52,7 @@ def get_many_types():
         pa.large_string(),
         pa.large_binary(),
         pa.list_(pa.int32()),
+        pa.large_list(pa.uint16()),
         pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.int8()),
                    pa.field('c', pa.string())]),
@@ -110,7 +111,14 @@ def test_is_decimal():
 
 
 def test_is_list():
-    assert types.is_list(pa.list_(pa.int32()))
+    a = pa.list_(pa.int32())
+    b = pa.large_list(pa.int32())
+
+    assert types.is_list(a)
+    assert not types.is_large_list(a)
+    assert types.is_large_list(b)
+    assert not types.is_list(b)
+
     assert not types.is_list(pa.int32())
 
 
@@ -129,6 +137,7 @@ def test_is_nested_or_struct():
 
     assert types.is_nested(struct_ex)
     assert types.is_nested(pa.list_(pa.int32()))
+    assert types.is_nested(pa.large_list(pa.int32()))
     assert not types.is_nested(pa.int32())
 
 
@@ -237,12 +246,22 @@ def test_time64_units():
 
 def test_list_type():
     ty = pa.list_(pa.int64())
+    assert isinstance(ty, pa.ListType)
     assert ty.value_type == pa.int64()
 
     with pytest.raises(TypeError):
         pa.list_(None)
 
 
+def test_large_list_type():
+    ty = pa.large_list(pa.utf8())
+    assert isinstance(ty, pa.LargeListType)
+    assert ty.value_type == pa.utf8()
+
+    with pytest.raises(TypeError):
+        pa.large_list(None)
+
+
 def test_struct_type():
     fields = [
         # Duplicate field name on purpose
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0db15d5..03a4b3d 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -259,6 +259,27 @@ cdef class ListType(DataType):
         return pyarrow_wrap_data_type(self.list_type.value_type())
 
 
+cdef class LargeListType(DataType):
+    """
+    Concrete class for large list data types
+    (like ListType, but with 64-bit offsets).
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        DataType.init(self, type)
+        self.list_type = <const CLargeListType*> type.get()
+
+    def __reduce__(self):
+        return large_list, (self.value_type,)
+
+    @property
+    def value_type(self):
+        """
+        The data type of large list values.
+        """
+        return pyarrow_wrap_data_type(self.list_type.value_type())
+
+
 cdef class StructType(DataType):
     """
     Concrete class for struct data types.
@@ -1589,6 +1610,40 @@ cpdef ListType list_(value_type):
     return out
 
 
+cpdef LargeListType large_list(value_type):
+    """
+    Create LargeListType instance from child data type or field
+
+    This data type may not be supported by all Arrow implementations.
+    Unless you need to represent data larger than 2**31 elements, you should
+    prefer list_().
+
+    Parameters
+    ----------
+    value_type : DataType or Field
+
+    Returns
+    -------
+    list_type : DataType
+    """
+    cdef:
+        DataType data_type
+        Field _field
+        shared_ptr[CDataType] list_type
+        LargeListType out = LargeListType.__new__(LargeListType)
+
+    if isinstance(value_type, DataType):
+        _field = field('item', value_type)
+    elif isinstance(value_type, Field):
+        _field = value_type
+    else:
+        raise TypeError('List requires DataType or Field')
+
+    list_type.reset(new CLargeListType(_field.sp_field))
+    out.init(list_type)
+    return out
+
+
 cpdef DictionaryType dictionary(index_type, value_type, bint ordered=False):
     """
     Dictionary (categorical, or simply encoded) type
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index dc314e8..d4dffb7 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -35,7 +35,8 @@ _FLOATING_TYPES = {lib.Type_HALF_FLOAT, lib.Type_FLOAT, lib.Type_DOUBLE}
 _DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64}
 _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64}
 _TEMPORAL_TYPES = {lib.Type_TIMESTAMP} | _TIME_TYPES | _DATE_TYPES
-_NESTED_TYPES = {lib.Type_LIST, lib.Type_STRUCT, lib.Type_UNION, lib.Type_MAP}
+_NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT,
+                 lib.Type_UNION, lib.Type_MAP}
 
 
 def is_null(t):
@@ -164,6 +165,13 @@ def is_list(t):
     return t.id == lib.Type_LIST
 
 
+def is_large_list(t):
+    """
+    Return True if value is an instance of a large list type
+    """
+    return t.id == lib.Type_LARGE_LIST
+
+
 def is_struct(t):
     """
     Return True if value is an instance of a struct type