You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/06/19 02:24:15 UTC

[doris] branch master updated: [improvement] Change array offset type from UInt32 to UInt64 (#10070)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e404edf54 [improvement] Change array offset type from UInt32 to UInt64 (#10070)
0e404edf54 is described below

commit 0e404edf54f11499b1697958edab761cd52dae54
Author: camby <10...@qq.com>
AuthorDate: Sun Jun 19 10:24:08 2022 +0800

    [improvement] Change array offset type from UInt32 to UInt64 (#10070)
    
    Currently, an `Array<T>` column consists of two sub-columns, `offsets` and `data`, and the type of the `offsets` column is UInt32.
    If array_union is called repeatedly to merge arrays, the accumulated array size may overflow the UInt32 offset.
    So we need to widen the offset type to UInt64 before the `Array Data Type` is released.
---
 be/src/exec/olap_scanner.cpp                       |  2 +-
 be/src/olap/column_vector.cpp                      |  6 ++--
 be/src/olap/column_vector.h                        |  4 +--
 be/src/olap/row_block2.cpp                         |  6 ++--
 be/src/olap/rowset/segment_v2/column_reader.cpp    |  2 +-
 be/src/olap/rowset/segment_v2/column_reader.h      |  2 +-
 be/src/olap/rowset/segment_v2/column_writer.cpp    |  4 +--
 be/src/runtime/collection_value.cpp                | 18 ++++++------
 be/src/runtime/collection_value.h                  | 32 +++++++++++-----------
 be/src/runtime/tuple.cpp                           |  4 +--
 be/src/udf/udf.h                                   |  4 +--
 be/src/vec/columns/column.h                        |  2 +-
 be/src/vec/columns/column_string.cpp               |  4 +--
 be/src/vec/columns/column_vector.cpp               |  2 +-
 be/src/vec/data_types/data_type_array.cpp          |  4 +--
 be/src/vec/data_types/data_type_string.cpp         | 18 ++++++------
 be/src/vec/functions/array/function_array_size.h   |  2 +-
 be/src/vec/functions/function_string.h             |  2 +-
 be/src/vec/sink/mysql_result_writer.cpp            |  4 +--
 be/src/vec/utils/arrow_column_to_doris_column.cpp  |  8 +++---
 be/test/olap/column_vector_test.cpp                | 10 +++----
 .../org/apache/doris/catalog/PrimitiveType.java    |  3 +-
 22 files changed, 72 insertions(+), 71 deletions(-)

diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp
index 69d23f5219..3bd2783ea0 100644
--- a/be/src/exec/olap_scanner.cpp
+++ b/be/src/exec/olap_scanner.cpp
@@ -395,7 +395,7 @@ Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) {
                     auto pool = batch->tuple_data_pool();
                     CollectionValue::deep_copy_collection(
                             slot, item_type,
-                            [pool](int size) -> MemFootprint {
+                            [pool](int64_t size) -> MemFootprint {
                                 int64_t offset = pool->total_allocated_bytes();
                                 uint8_t* data = pool->allocate(size);
                                 return {offset, data};
diff --git a/be/src/olap/column_vector.cpp b/be/src/olap/column_vector.cpp
index 41e91b9d63..c4962926f2 100644
--- a/be/src/olap/column_vector.cpp
+++ b/be/src/olap/column_vector.cpp
@@ -144,13 +144,13 @@ Status ColumnVectorBatch::create(size_t init_capacity, bool is_nullable, const T
                     array_type_info->item_type_info(), field->get_sub_field(0), &elements));
 
             std::unique_ptr<ColumnVectorBatch> offsets;
-            const auto* offsets_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_INT>();
+            const auto* offsets_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>();
             RETURN_IF_ERROR(ColumnVectorBatch::create(init_capacity + 1, false, offsets_type_info,
                                                       nullptr, &offsets));
 
             std::unique_ptr<ColumnVectorBatch> local(new ArrayColumnVectorBatch(
                     type_info, is_nullable,
-                    reinterpret_cast<ScalarColumnVectorBatch<uint32_t>*>(offsets.release()),
+                    reinterpret_cast<ScalarColumnVectorBatch<uint64_t>*>(offsets.release()),
                     elements.release()));
             RETURN_IF_ERROR(local->resize(init_capacity));
             *column_vector_batch = std::move(local);
@@ -181,7 +181,7 @@ Status ScalarColumnVectorBatch<ScalarType>::resize(size_t new_cap) {
 }
 
 ArrayColumnVectorBatch::ArrayColumnVectorBatch(const TypeInfo* type_info, bool is_nullable,
-                                               ScalarColumnVectorBatch<uint32_t>* offsets,
+                                               ScalarColumnVectorBatch<uint64_t>* offsets,
                                                ColumnVectorBatch* elements)
         : ColumnVectorBatch(type_info, is_nullable), _data(0) {
     _offsets.reset(offsets);
diff --git a/be/src/olap/column_vector.h b/be/src/olap/column_vector.h
index 70ee6e3041..28139fe7d2 100644
--- a/be/src/olap/column_vector.h
+++ b/be/src/olap/column_vector.h
@@ -178,7 +178,7 @@ private:
 class ArrayColumnVectorBatch : public ColumnVectorBatch {
 public:
     explicit ArrayColumnVectorBatch(const TypeInfo* type_info, bool is_nullable,
-                                    ScalarColumnVectorBatch<uint32_t>* offsets,
+                                    ScalarColumnVectorBatch<uint64_t>* offsets,
                                     ColumnVectorBatch* elements);
     ~ArrayColumnVectorBatch() override;
     Status resize(size_t new_cap) override;
@@ -249,7 +249,7 @@ private:
     std::unique_ptr<ColumnVectorBatch> _elements;
 
     // Stores each array's start offsets in _elements.
-    std::unique_ptr<ScalarColumnVectorBatch<uint32_t>> _offsets;
+    std::unique_ptr<ScalarColumnVectorBatch<uint64_t>> _offsets;
 };
 
 } // namespace doris
diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp
index fd4b0ef23e..947bdd4055 100644
--- a/be/src/olap/row_block2.cpp
+++ b/be/src/olap/row_block2.cpp
@@ -289,7 +289,7 @@ Status RowBlockV2::_copy_data_to_column(int cid,
 
         auto& offsets_col = column_array->get_offsets();
         offsets_col.reserve(_selected_size);
-        uint32_t offset = offsets_col.back();
+        uint64_t offset = offsets_col.back();
         for (uint16_t j = 0; j < _selected_size; ++j) {
             uint16_t row_idx = _selection_vector[j];
             auto cv = reinterpret_cast<const CollectionValue*>(column_block(cid).cell_ptr(row_idx));
@@ -550,10 +550,10 @@ Status RowBlockV2::_append_data_to_column(const ColumnVectorBatch* batch, size_t
         auto nested_col = (*column_array->get_data_ptr()).assume_mutable();
 
         auto& offsets_col = column_array->get_offsets();
-        uint32_t offset = offsets_col.back();
+        auto offset = offsets_col.back();
         for (uint32_t j = 0; j < selected_size; ++j) {
             if (!nullable_mark_array[j]) {
-                uint32_t row_idx = j + start;
+                uint64_t row_idx = j + start;
                 auto cv = reinterpret_cast<const CollectionValue*>(batch->cell_ptr(row_idx));
                 offset += cv->length();
                 _append_data_to_column(array_batch->elements(), array_batch->item_offset(row_idx),
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index d68ffc66c0..fe6cd94a82 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -396,7 +396,7 @@ Status ArrayFileColumnIterator::init(const ColumnIteratorOptions& opts) {
     if (_array_reader->is_nullable()) {
         RETURN_IF_ERROR(_null_iterator->init(opts));
     }
-    const auto* offset_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_INT>();
+    const auto* offset_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>();
     RETURN_IF_ERROR(
             ColumnVectorBatch::create(1024, false, offset_type_info, nullptr, &_length_batch));
     return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 47250a96cb..a365679ee7 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -369,7 +369,7 @@ public:
                                            : size_to_read;
                 ColumnBlockView ordinal_view(&ordinal_block);
                 RETURN_IF_ERROR(_length_iterator->next_batch(&this_read, &ordinal_view, &has_null));
-                auto* ordinals = reinterpret_cast<uint32_t*>(_length_batch->data());
+                auto* ordinals = reinterpret_cast<uint64_t*>(_length_batch->data());
                 for (int i = 0; i < this_read; ++i) {
                     item_ordinal += ordinals[i];
                 }
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp
index 6c76ddff62..c96ccd9a61 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -110,7 +110,7 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, const TabletColumn*
                     ColumnWriter::create(item_options, &item_column, _wblock, &item_writer));
 
             // create length writer
-            FieldType length_type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT;
+            FieldType length_type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
 
             ColumnWriterOptions length_options;
             length_options.meta = opts.meta->add_children_columns();
@@ -119,7 +119,7 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, const TabletColumn*
             length_options.meta->set_type(length_type);
             length_options.meta->set_is_nullable(false);
             length_options.meta->set_length(
-                    get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_INT>()->size());
+                    get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>()->size());
             length_options.meta->set_encoding(DEFAULT_ENCODING);
             length_options.meta->set_compression(opts.meta->compression());
 
diff --git a/be/src/runtime/collection_value.cpp b/be/src/runtime/collection_value.cpp
index 9ea8fc3d43..d8581d8ac5 100644
--- a/be/src/runtime/collection_value.cpp
+++ b/be/src/runtime/collection_value.cpp
@@ -186,7 +186,7 @@ struct ArrayIteratorFunctionsForString : public GenericArrayIteratorFunctions<ty
     static void deserialize(void* item, const char* tuple_data, const TypeDescriptor& type_desc) {
         auto* string_value = static_cast<CppType*>(item);
         if (string_value->len) {
-            int offset = convert_to<int>(string_value->ptr);
+            int64_t offset = convert_to<int64_t>(string_value->ptr);
             string_value->ptr = convert_to<char*>(tuple_data + offset);
         }
     }
@@ -448,7 +448,7 @@ size_t CollectionValue::get_byte_size(const TypeDescriptor& item_type) const {
     return result;
 }
 
-Status CollectionValue::init_collection(ObjectPool* pool, uint32_t size, PrimitiveType child_type,
+Status CollectionValue::init_collection(ObjectPool* pool, uint64_t size, PrimitiveType child_type,
                                         CollectionValue* value) {
     return init_collection(
             value, [pool](size_t size) -> uint8_t* { return pool->add_array(new uint8_t[size]); },
@@ -456,7 +456,7 @@ Status CollectionValue::init_collection(ObjectPool* pool, uint32_t size, Primiti
 }
 
 Status CollectionValue::init_collection(CollectionValue* value, const AllocateMemFunc& allocate,
-                                        uint32_t size, PrimitiveType child_type) {
+                                        uint64_t size, PrimitiveType child_type) {
     if (value == nullptr) {
         return Status::InvalidArgument("collection value is null");
     }
@@ -477,13 +477,13 @@ Status CollectionValue::init_collection(CollectionValue* value, const AllocateMe
     return Status::OK();
 }
 
-Status CollectionValue::init_collection(MemPool* pool, uint32_t size, PrimitiveType child_type,
+Status CollectionValue::init_collection(MemPool* pool, uint64_t size, PrimitiveType child_type,
                                         CollectionValue* value) {
     return init_collection(
             value, [pool](size_t size) { return pool->allocate(size); }, size, child_type);
 }
 
-Status CollectionValue::init_collection(FunctionContext* context, uint32_t size,
+Status CollectionValue::init_collection(FunctionContext* context, uint64_t size,
                                         PrimitiveType child_type, CollectionValue* value) {
     return init_collection(
             value, [context](size_t size) { return context->allocate(size); }, size, child_type);
@@ -506,8 +506,8 @@ void CollectionValue::deep_copy_collection(CollectionValue* shallow_copied_cv,
     }
 
     auto iterator = cv->iterator(item_type.type);
-    int coll_byte_size = cv->length() * iterator.type_size();
-    int nulls_size = cv->has_null() ? cv->length() * sizeof(bool) : 0;
+    uint64_t coll_byte_size = cv->length() * iterator.type_size();
+    uint64_t nulls_size = cv->has_null() ? cv->length() * sizeof(bool) : 0;
 
     MemFootprint footprint = gen_mem_footprint(coll_byte_size + nulls_size);
     int64_t offset = footprint.first;
@@ -544,10 +544,10 @@ void CollectionValue::deserialize_collection(CollectionValue* cv, const char* tu
         return;
     }
     // assgin data and null_sign pointer position in tuple_data
-    int data_offset = convert_to<int>(cv->data());
+    int64_t data_offset = convert_to<int64_t>(cv->data());
     cv->set_data(convert_to<char*>(tuple_data + data_offset));
     if (cv->has_null()) {
-        int null_offset = convert_to<int>(cv->null_signs());
+        int64_t null_offset = convert_to<int64_t>(cv->null_signs());
         cv->set_null_signs(convert_to<bool*>(tuple_data + null_offset));
     }
     auto iterator = cv->iterator(item_type.type);
diff --git a/be/src/runtime/collection_value.h b/be/src/runtime/collection_value.h
index ccf623171f..3fac161503 100644
--- a/be/src/runtime/collection_value.h
+++ b/be/src/runtime/collection_value.h
@@ -32,7 +32,7 @@ using doris_udf::FunctionContext;
 using doris_udf::AnyVal;
 
 using MemFootprint = std::pair<int64_t, uint8_t*>;
-using GenMemFootprintFunc = std::function<MemFootprint(int size)>;
+using GenMemFootprintFunc = std::function<MemFootprint(int64_t size)>;
 
 struct ArrayIteratorFunctionsBase;
 class ArrayIterator;
@@ -64,25 +64,25 @@ class CollectionValue {
 public:
     CollectionValue() = default;
 
-    explicit CollectionValue(uint32_t length)
+    explicit CollectionValue(uint64_t length)
             : _data(nullptr), _length(length), _has_null(false), _null_signs(nullptr) {}
 
-    CollectionValue(void* data, uint32_t length)
+    CollectionValue(void* data, uint64_t length)
             : _data(data), _length(length), _has_null(false), _null_signs(nullptr) {}
 
-    CollectionValue(void* data, uint32_t length, bool* null_signs)
+    CollectionValue(void* data, uint64_t length, bool* null_signs)
             : _data(data), _length(length), _has_null(true), _null_signs(null_signs) {}
 
-    CollectionValue(void* data, uint32_t length, bool has_null, bool* null_signs)
+    CollectionValue(void* data, uint64_t length, bool has_null, bool* null_signs)
             : _data(data), _length(length), _has_null(has_null), _null_signs(null_signs) {}
 
-    bool is_null_at(uint32_t index) const { return this->_has_null && this->_null_signs[index]; }
+    bool is_null_at(uint64_t index) const { return this->_has_null && this->_null_signs[index]; }
 
     void to_collection_val(CollectionVal* val) const;
 
-    uint32_t size() const { return _length; }
+    uint64_t size() const { return _length; }
 
-    uint32_t length() const { return _length; }
+    uint64_t length() const { return _length; }
 
     void shallow_copy(const CollectionValue* other);
 
@@ -96,13 +96,13 @@ public:
     /**
      * init collection, will alloc (children Type's size + 1) * (children Nums) memory  
      */
-    static Status init_collection(ObjectPool* pool, uint32_t size, PrimitiveType child_type,
+    static Status init_collection(ObjectPool* pool, uint64_t size, PrimitiveType child_type,
                                   CollectionValue* value);
 
-    static Status init_collection(MemPool* pool, uint32_t size, PrimitiveType child_type,
+    static Status init_collection(MemPool* pool, uint64_t size, PrimitiveType child_type,
                                   CollectionValue* value);
 
-    static Status init_collection(FunctionContext* context, uint32_t size, PrimitiveType child_type,
+    static Status init_collection(FunctionContext* context, uint64_t size, PrimitiveType child_type,
                                   CollectionValue* value);
 
     static CollectionValue from_collection_val(const CollectionVal& val);
@@ -123,7 +123,7 @@ public:
     const bool* null_signs() const { return _null_signs; }
     void* mutable_data() { return _data; }
     bool* mutable_null_signs() { return _null_signs; }
-    void set_length(uint32_t length) { _length = length; }
+    void set_length(uint64_t length) { _length = length; }
     void set_has_null(bool has_null) { _has_null = has_null; }
     void set_data(void* data) { _data = data; }
     void set_null_signs(bool* null_signs) { _null_signs = null_signs; }
@@ -131,13 +131,13 @@ public:
 private:
     using AllocateMemFunc = std::function<uint8_t*(size_t size)>;
     static Status init_collection(CollectionValue* value, const AllocateMemFunc& allocate,
-                                  uint32_t size, PrimitiveType child_type);
+                                  uint64_t size, PrimitiveType child_type);
     ArrayIterator internal_iterator(PrimitiveType child_type) const;
 
 private:
     // child column data
     void* _data;
-    uint32_t _length;
+    uint64_t _length;
     // item has no null value if has_null is false.
     // item ```may``` has null value if has_null is true.
     bool _has_null;
@@ -160,7 +160,7 @@ public:
         }
         return false;
     }
-    bool seek(uint32_t n) const {
+    bool seek(uint64_t n) const {
         if (n >= _collection_value->size()) {
             return false;
         }
@@ -248,7 +248,7 @@ private:
 
 private:
     CollectionValue* _collection_value;
-    mutable uint32_t _offset;
+    mutable uint64_t _offset;
     const int _type_size;
     const bool _is_type_fixed_width;
 
diff --git a/be/src/runtime/tuple.cpp b/be/src/runtime/tuple.cpp
index 92ee49ac0b..641c13c4bb 100644
--- a/be/src/runtime/tuple.cpp
+++ b/be/src/runtime/tuple.cpp
@@ -93,7 +93,7 @@ void Tuple::deep_copy(Tuple* dst, const TupleDescriptor& desc, MemPool* pool, bo
     // copy collection slot
     deep_copy_collection_slots(
             dst, desc,
-            [pool](int size) -> MemFootprint {
+            [pool](int64_t size) -> MemFootprint {
                 int64_t offset = pool->total_allocated_bytes();
                 uint8_t* data = pool->allocate(size);
                 return {offset, data};
@@ -186,7 +186,7 @@ void Tuple::deep_copy(const TupleDescriptor& desc, char** data, int64_t* offset,
     // copy collection slots
     deep_copy_collection_slots(
             dst, desc,
-            [offset, data](int size) -> MemFootprint {
+            [offset, data](int64_t size) -> MemFootprint {
                 MemFootprint footprint = {*offset, reinterpret_cast<uint8_t*>(*data)};
                 *offset += size;
                 *data += size;
diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h
index 56e447f375..324a1d362a 100644
--- a/be/src/udf/udf.h
+++ b/be/src/udf/udf.h
@@ -742,7 +742,7 @@ struct HllVal : public StringVal {
 
 struct CollectionVal : public AnyVal {
     void* data;
-    uint32_t length;
+    uint64_t length;
     // item has no null value if has_null is false.
     // item ```may``` has null value if has_null is true.
     bool has_null;
@@ -751,7 +751,7 @@ struct CollectionVal : public AnyVal {
 
     CollectionVal() = default;
 
-    CollectionVal(void* data, uint32_t length, bool has_null, bool* null_signs)
+    CollectionVal(void* data, uint64_t length, bool has_null, bool* null_signs)
             : data(data), length(length), has_null(has_null), null_signs(null_signs) {};
 
     static CollectionVal null() {
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index b0b6231cbb..77c61a4153 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -306,7 +306,7 @@ public:
       * (i-th element should be copied offsets[i] - offsets[i - 1] times.)
       * It is necessary in ARRAY JOIN operation.
       */
-    using Offset = UInt32;
+    using Offset = UInt64;
     using Offsets = PaddedPODArray<Offset>;
     virtual Ptr replicate(const Offsets& offsets) const = 0;
 
diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index c469d79190..12701ab5ed 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -164,7 +164,7 @@ ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const {
 
 StringRef ColumnString::serialize_value_into_arena(size_t n, Arena& arena,
                                                    char const*& begin) const {
-    UInt32 string_size = size_at(n);
+    IColumn::Offset string_size = size_at(n);
     size_t offset = offset_at(n);
 
     StringRef res;
@@ -178,7 +178,7 @@ StringRef ColumnString::serialize_value_into_arena(size_t n, Arena& arena,
 }
 
 const char* ColumnString::deserialize_and_insert_from_arena(const char* pos) {
-    const UInt32 string_size = unaligned_load<UInt32>(pos);
+    const IColumn::Offset string_size = unaligned_load<IColumn::Offset>(pos);
     pos += sizeof(string_size);
 
     const size_t old_size = chars.size();
diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp
index acc7e4b9a2..dde2f033a7 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -327,7 +327,7 @@ ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets& offsets) const {
 
     // vectorized this code to speed up
     IColumn::Offset counts[size];
-    for (size_t i = 0; i < size; ++i) {
+    for (ssize_t i = 0; i < size; ++i) {
         counts[i] = offsets[i] - offsets[i - 1];
     }
 
diff --git a/be/src/vec/data_types/data_type_array.cpp b/be/src/vec/data_types/data_type_array.cpp
index cc67eb7973..e39f1c569b 100644
--- a/be/src/vec/data_types/data_type_array.cpp
+++ b/be/src/vec/data_types/data_type_array.cpp
@@ -65,7 +65,7 @@ char* DataTypeArray::serialize(const IColumn& column, char* buf) const {
     const auto& data_column = assert_cast<const ColumnArray&>(*ptr.get());
 
     // row num
-    *reinterpret_cast<uint32_t*>(buf) = column.size();
+    *reinterpret_cast<IColumn::Offset*>(buf) = column.size();
     buf += sizeof(IColumn::Offset);
     // offsets
     memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(IColumn::Offset));
@@ -79,7 +79,7 @@ const char* DataTypeArray::deserialize(const char* buf, IColumn* column) const {
     auto& offsets = data_column->get_offsets();
 
     // row num
-    uint32_t row_num = *reinterpret_cast<const IColumn::Offset*>(buf);
+    IColumn::Offset row_num = *reinterpret_cast<const IColumn::Offset*>(buf);
     buf += sizeof(IColumn::Offset);
     // offsets
     offsets.resize(row_num);
diff --git a/be/src/vec/data_types/data_type_string.cpp b/be/src/vec/data_types/data_type_string.cpp
index 220b418e4b..b7bf2fdc2e 100644
--- a/be/src/vec/data_types/data_type_string.cpp
+++ b/be/src/vec/data_types/data_type_string.cpp
@@ -90,7 +90,7 @@ bool DataTypeString::equals(const IDataType& rhs) const {
 int64_t DataTypeString::get_uncompressed_serialized_bytes(const IColumn& column) const {
     auto ptr = column.convert_to_full_column_if_const();
     const auto& data_column = assert_cast<const ColumnString&>(*ptr.get());
-    return sizeof(uint32_t) * (column.size() + 1) + sizeof(uint64_t) +
+    return sizeof(IColumn::Offset) * (column.size() + 1) + sizeof(uint64_t) +
            data_column.get_chars().size();
 }
 
@@ -99,11 +99,11 @@ char* DataTypeString::serialize(const IColumn& column, char* buf) const {
     const auto& data_column = assert_cast<const ColumnString&>(*ptr.get());
 
     // row num
-    *reinterpret_cast<uint32_t*>(buf) = column.size();
-    buf += sizeof(uint32_t);
+    *reinterpret_cast<IColumn::Offset*>(buf) = column.size();
+    buf += sizeof(IColumn::Offset);
     // offsets
-    memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(uint32_t));
-    buf += column.size() * sizeof(uint32_t);
+    memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(IColumn::Offset));
+    buf += column.size() * sizeof(IColumn::Offset);
     // total length
     uint64_t value_len = data_column.get_chars().size();
     *reinterpret_cast<uint64_t*>(buf) = value_len;
@@ -121,12 +121,12 @@ const char* DataTypeString::deserialize(const char* buf, IColumn* column) const
     ColumnString::Offsets& offsets = column_string->get_offsets();
 
     // row num
-    uint32_t row_num = *reinterpret_cast<const uint32_t*>(buf);
-    buf += sizeof(uint32_t);
+    IColumn::Offset row_num = *reinterpret_cast<const IColumn::Offset*>(buf);
+    buf += sizeof(IColumn::Offset);
     // offsets
     offsets.resize(row_num);
-    memcpy(offsets.data(), buf, sizeof(uint32_t) * row_num);
-    buf += sizeof(uint32_t) * row_num;
+    memcpy(offsets.data(), buf, sizeof(IColumn::Offset) * row_num);
+    buf += sizeof(IColumn::Offset) * row_num;
     // total length
     uint64_t value_len = *reinterpret_cast<const uint64_t*>(buf);
     buf += sizeof(uint64_t);
diff --git a/be/src/vec/functions/array/function_array_size.h b/be/src/vec/functions/array/function_array_size.h
index bffed4460d..1988c5b66a 100644
--- a/be/src/vec/functions/array/function_array_size.h
+++ b/be/src/vec/functions/array/function_array_size.h
@@ -59,7 +59,7 @@ public:
         auto dst_column = ColumnInt64::create(input_rows_count);
         auto& dst_data = dst_column->get_data();
 
-        for (size_t i = 0; i < offsets.size(); ++i) {
+        for (ssize_t i = 0; i < offsets.size(); ++i) {
             dst_data[i] = offsets[i] - offsets[i - 1];
         }
 
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 3027c121d9..3c29cb8745 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -635,7 +635,7 @@ public:
         //
         fmt::memory_buffer buffer;
         res_offsets.resize(input_row_size);
-        for (size_t i = 0; i < input_row_size; ++i) {
+        for (ssize_t i = 0; i < input_row_size; ++i) {
             buffer.clear();
             const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
             int size = offsets[i] - offsets[i - 1] - 1;
diff --git a/be/src/vec/sink/mysql_result_writer.cpp b/be/src/vec/sink/mysql_result_writer.cpp
index 55fa8aaf41..1902bf9972 100644
--- a/be/src/vec/sink/mysql_result_writer.cpp
+++ b/be/src/vec/sink/mysql_result_writer.cpp
@@ -111,7 +111,7 @@ Status VMysqlResultWriter::_add_one_column(const ColumnPtr& column_ptr,
     } else if constexpr (type == TYPE_ARRAY) {
         auto& column_array = assert_cast<const ColumnArray&>(*column);
         auto& offsets = column_array.get_offsets();
-        for (int i = 0; i < row_size; ++i) {
+        for (ssize_t i = 0; i < row_size; ++i) {
             if (0 != buf_ret) {
                 return Status::InternalError("pack mysql buffer failed.");
             }
@@ -128,7 +128,7 @@ Status VMysqlResultWriter::_add_one_column(const ColumnPtr& column_ptr,
             _buffer.open_dynamic_mode();
             buf_ret = _buffer.push_string("[", 1);
             bool begin = true;
-            for (int j = offsets[i - 1]; j < offsets[i]; ++j) {
+            for (auto j = offsets[i - 1]; j < offsets[i]; ++j) {
                 if (!begin) {
                     buf_ret = _buffer.push_string(", ", 2);
                 }
diff --git a/be/src/vec/utils/arrow_column_to_doris_column.cpp b/be/src/vec/utils/arrow_column_to_doris_column.cpp
index 206c279e4c..76741fdb59 100644
--- a/be/src/vec/utils/arrow_column_to_doris_column.cpp
+++ b/be/src/vec/utils/arrow_column_to_doris_column.cpp
@@ -100,8 +100,8 @@ static size_t fill_nullable_column(const arrow::Array* array, size_t array_idx,
 /// Also internal strings are null terminated.
 static Status convert_column_with_string_data(const arrow::Array* array, size_t array_idx,
                                               MutableColumnPtr& data_column, size_t num_elements) {
-    PaddedPODArray<UInt8>& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
-    PaddedPODArray<UInt32>& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
+    auto& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
+    auto& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
 
     auto concrete_array = down_cast<const arrow::BinaryArray*>(array);
     std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
@@ -121,8 +121,8 @@ static Status convert_column_with_string_data(const arrow::Array* array, size_t
 static Status convert_column_with_fixed_size_data(const arrow::Array* array, size_t array_idx,
                                                   MutableColumnPtr& data_column,
                                                   size_t num_elements) {
-    PaddedPODArray<UInt8>& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
-    PaddedPODArray<UInt32>& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
+    auto& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
+    auto& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
 
     auto concrete_array = down_cast<const arrow::FixedSizeBinaryArray*>(array);
     uint32_t width = concrete_array->byte_width();
diff --git a/be/test/olap/column_vector_test.cpp b/be/test/olap/column_vector_test.cpp
index 5faffb041d..19a28088d8 100644
--- a/be/test/olap/column_vector_test.cpp
+++ b/be/test/olap/column_vector_test.cpp
@@ -95,8 +95,8 @@ void test_read_write_array_column_vector(const TypeInfo* array_type_info, size_t
 
     // first write
     for (size_t i = 0; i < array_init_size; ++i) {
-        uint32_t len = result[i].length();
-        memcpy(offset_cvb->mutable_cell_ptr(1 + i), &len, sizeof(uint32_t));
+        uint64_t len = result[i].length();
+        memcpy(offset_cvb->mutable_cell_ptr(1 + i), &len, sizeof(uint64_t));
     }
     array_cvb->set_null_bits(0, array_init_size, false);
     array_cvb->get_offset_by_length(0, array_init_size);
@@ -114,8 +114,8 @@ void test_read_write_array_column_vector(const TypeInfo* array_type_info, size_t
     // second write
     EXPECT_TRUE(array_cvb->resize(array_size).ok());
     for (int i = array_init_size; i < array_size; ++i) {
-        uint32_t len = result[i].length();
-        memcpy(offset_cvb->mutable_cell_ptr(i + 1), &len, sizeof(uint32_t));
+        uint64_t len = result[i].length();
+        memcpy(offset_cvb->mutable_cell_ptr(i + 1), &len, sizeof(uint64_t));
     }
     array_cvb->set_null_bits(array_init_size, array_size - array_init_size, false);
     array_cvb->get_offset_by_length(array_init_size, array_size - array_init_size);
@@ -170,7 +170,7 @@ TEST_F(ColumnVectorTest, array_column_vector_test) {
 
         auto* item_val = new uint8_t[num_item];
         memset(null_signs, 0, sizeof(bool) * 3);
-        for (int i = 0; i < num_item; ++i) {
+        for (size_t i = 0; i < num_item; ++i) {
             item_val[i] = i;
             if (i % 3 == 0) {
                 size_t array_index = i / 3;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java
index 4c5c4c4c6f..32dc2bec55 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java
@@ -55,7 +55,8 @@ public enum PrimitiveType {
     BITMAP("BITMAP", 16, TPrimitiveType.OBJECT),
     QUANTILE_STATE("QUANTILE_STATE", 16, TPrimitiveType.QUANTILE_STATE),
 
-    ARRAY("ARRAY", 24, TPrimitiveType.ARRAY),
+    // sizeof(CollectionValue)
+    ARRAY("ARRAY", 32, TPrimitiveType.ARRAY),
     MAP("MAP", 24, TPrimitiveType.MAP),
     STRUCT("STRUCT", 24, TPrimitiveType.STRUCT),
     STRING("STRING", 16, TPrimitiveType.STRING),


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org