You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/10/08 10:03:17 UTC

[doris] branch master updated: [feature-wip](parquet-reader) optimize the performance of column conversion (#13122)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b81a8789c3 [feature-wip](parquet-reader) optimize the performance of column conversion (#13122)
b81a8789c3 is described below

commit b81a8789c3f6d47df6fee85c7f0fd470f2d1ca16
Author: Ashin Gau <As...@users.noreply.github.com>
AuthorDate: Sat Oct 8 18:03:10 2022 +0800

    [feature-wip](parquet-reader) optimize the performance of column conversion (#13122)
    
    Convert Parquet columns into Doris columns in batches.
    In the previous implementation, only numeric types could be converted in batches;
    all other types had to be inserted one value at a time.
    That per-value insertion caused repeated virtual function calls and repeated container growth.
---
 be/src/vec/exec/format/parquet/parquet_common.cpp  | 44 ++++++++--------
 be/src/vec/exec/format/parquet/parquet_common.h    | 61 +++++++++++++++-------
 be/src/vec/exec/format/parquet/parquet_pred_cmp.h  | 28 +++-------
 .../parquet/vparquet_column_chunk_reader.cpp       |  4 +-
 .../exec/format/parquet/vparquet_column_reader.cpp | 30 ++++++++---
 5 files changed, 93 insertions(+), 74 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/parquet_common.cpp b/be/src/vec/exec/format/parquet/parquet_common.cpp
index fd8d5d7428..83c702f800 100644
--- a/be/src/vec/exec/format/parquet/parquet_common.cpp
+++ b/be/src/vec/exec/format/parquet/parquet_common.cpp
@@ -40,6 +40,12 @@ inline uint64_t ParquetInt96::to_timestamp_micros() const {
     M(TypeIndex::Float32, Float32)   \
     M(TypeIndex::Float64, Float64)
 
+#define FOR_SHORT_INT_TYPES(M) \
+    M(TypeIndex::Int8, Int8)   \
+    M(TypeIndex::UInt8, UInt8) \
+    M(TypeIndex::Int16, Int16) \
+    M(TypeIndex::UInt16, UInt16)
+
 Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type encoding,
                             std::unique_ptr<Decoder>& decoder) {
     switch (encoding) {
@@ -155,19 +161,6 @@ Status FixLengthDecoder::skip_values(size_t num_values) {
     return Status::OK();
 }
 
-Status FixLengthDecoder::_decode_short_int(MutableColumnPtr& doris_column, size_t num_values,
-                                           size_t real_length) {
-    if (UNLIKELY(_physical_type != tparquet::Type::INT32)) {
-        return Status::InternalError("Short int can only be decoded from INT32");
-    }
-    for (int i = 0; i < num_values; ++i) {
-        char* buf_start = _FIXED_GET_DATA_OFFSET(i);
-        doris_column->insert_data(buf_start, real_length);
-        _FIXED_SHIFT_DATA_OFFSET();
-    }
-    return Status::OK();
-}
-
 Status FixLengthDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                        size_t num_values) {
     if (_has_dict) {
@@ -178,12 +171,11 @@ Status FixLengthDecoder::decode_values(MutableColumnPtr& doris_column, DataTypeP
     }
     TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
     switch (logical_type) {
-    case TypeIndex::Int8:
-    case TypeIndex::UInt8:
-        return _decode_short_int(doris_column, num_values, 1);
-    case TypeIndex::Int16:
-    case TypeIndex::UInt16:
-        return _decode_short_int(doris_column, num_values, 2);
+#define DISPATCH(SHORT_INT_TYPE, CPP_SHORT_INT_TYPE) \
+    case SHORT_INT_TYPE:                             \
+        return _decode_short_int<CPP_SHORT_INT_TYPE>(doris_column, num_values);
+        FOR_SHORT_INT_TYPES(DISPATCH)
+#undef DISPATCH
 #define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE) \
     case NUMERIC_TYPE:                           \
         return _decode_numeric<CPP_NUMERIC_TYPE>(doris_column, num_values);
@@ -329,13 +321,15 @@ Status ByteArrayDecoder::decode_values(MutableColumnPtr& doris_column, DataTypeP
     TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
     switch (logical_type) {
     case TypeIndex::String:
-    case TypeIndex::FixedString:
+    case TypeIndex::FixedString: {
+        std::vector<StringRef> string_values;
+        string_values.reserve(num_values);
         for (int i = 0; i < num_values; ++i) {
             if (_has_dict) {
                 uint32_t idx = _indexes[i];
                 uint32_t idx_cursor = _dict_offsets[idx];
                 char* buff_start = reinterpret_cast<char*>(_dict.get() + idx_cursor);
-                doris_column->insert_data(buff_start, _dict_offsets[idx + 1] - idx_cursor - 4);
+                string_values.emplace_back(buff_start, _dict_offsets[idx + 1] - idx_cursor - 4);
             } else {
                 if (UNLIKELY(_offset + 4 > _data->size)) {
                     return Status::IOError("Can't read byte array length from plain decoder");
@@ -346,11 +340,13 @@ Status ByteArrayDecoder::decode_values(MutableColumnPtr& doris_column, DataTypeP
                 if (UNLIKELY(_offset + length) > _data->size) {
                     return Status::IOError("Can't read enough bytes in plain decoder");
                 }
-                doris_column->insert_data(_data->data + _offset, length);
+                string_values.emplace_back(_data->data + _offset, length);
                 _offset += length;
             }
         }
+        doris_column->insert_many_strings(&string_values[0], num_values);
         return Status::OK();
+    }
     case TypeIndex::Decimal32:
         return _decode_binary_decimal<Int32>(doris_column, data_type, num_values);
     case TypeIndex::Decimal64:
@@ -392,12 +388,14 @@ Status BoolPlainDecoder::skip_values(size_t num_values) {
 Status BoolPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                        size_t num_values) {
     auto& column_data = static_cast<ColumnVector<UInt8>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     bool value;
     for (int i = 0; i < num_values; ++i) {
         if (UNLIKELY(!_decode_value(&value))) {
             return Status::IOError("Can't read enough booleans in plain decoder");
         }
-        column_data.emplace_back(value);
+        column_data[origin_size + i] = (UInt8)value;
     }
     return Status::OK();
 }
diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h
index e08027a137..a5e67fad2e 100644
--- a/be/src/vec/exec/format/parquet/parquet_common.h
+++ b/be/src/vec/exec/format/parquet/parquet_common.h
@@ -180,7 +180,8 @@ public:
     void set_data(Slice* data) override;
 
 protected:
-    Status _decode_short_int(MutableColumnPtr& doris_column, size_t num_values, size_t real_length);
+    template <typename ShortIntType>
+    Status _decode_short_int(MutableColumnPtr& doris_column, size_t num_values);
 
     template <typename Numeric>
     Status _decode_numeric(MutableColumnPtr& doris_column, size_t num_values);
@@ -219,15 +220,33 @@ protected:
     std::vector<uint32_t> _indexes;
 };
 
+template <typename ShortIntType>
+Status FixLengthDecoder::_decode_short_int(MutableColumnPtr& doris_column, size_t num_values) {
+    if (UNLIKELY(_physical_type != tparquet::Type::INT32)) {
+        return Status::InternalError("Short int can only be decoded from INT32");
+    }
+    auto& column_data = static_cast<ColumnVector<ShortIntType>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
+    for (int i = 0; i < num_values; ++i) {
+        char* buf_start = _FIXED_GET_DATA_OFFSET(i);
+        column_data[origin_size + i] = *(ShortIntType*)buf_start;
+        _FIXED_SHIFT_DATA_OFFSET();
+    }
+    return Status::OK();
+}
+
 template <typename Numeric>
 Status FixLengthDecoder::_decode_numeric(MutableColumnPtr& doris_column, size_t num_values) {
+    auto& column_data = static_cast<ColumnVector<Numeric>&>(*doris_column).get_data();
     if (_has_dict) {
+        auto origin_size = column_data.size();
+        column_data.resize(origin_size + num_values);
         for (int i = 0; i < num_values; ++i) {
             char* buf_start = _FIXED_GET_DATA_OFFSET(i);
-            doris_column->insert_data(buf_start, _type_length);
+            column_data[origin_size + i] = *(Numeric*)buf_start;
         }
     } else {
-        auto& column_data = static_cast<ColumnVector<Numeric>&>(*doris_column).get_data();
         const auto* raw_data = reinterpret_cast<const Numeric*>(_data->data + _offset);
         column_data.insert(raw_data, raw_data + num_values);
         _offset += _type_length * num_values;
@@ -239,17 +258,17 @@ template <typename CppType, typename ColumnType>
 Status FixLengthDecoder::_decode_date(MutableColumnPtr& doris_column, TypeIndex& logical_type,
                                       size_t num_values) {
     auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     for (int i = 0; i < num_values; ++i) {
         char* buf_start = _FIXED_GET_DATA_OFFSET(i);
         int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start));
-        CppType v;
+        auto& v = reinterpret_cast<CppType&>(column_data[origin_size + i]);
         v.from_unixtime(date_value * 24 * 60 * 60, *_decode_params->ctz); // day to seconds
         if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
             // we should cast to date if using date v1.
             v.cast_to_date();
         }
-        ColumnType& cast_value = *reinterpret_cast<ColumnType*>(&v);
-        column_data.emplace_back(cast_value);
         _FIXED_SHIFT_DATA_OFFSET();
     }
     return Status::OK();
@@ -259,18 +278,18 @@ template <typename CppType, typename ColumnType>
 Status FixLengthDecoder::_decode_datetime64(MutableColumnPtr& doris_column, TypeIndex& logical_type,
                                             size_t num_values) {
     auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     for (int i = 0; i < num_values; i++) {
         char* buf_start = _FIXED_GET_DATA_OFFSET(i);
         int64_t& date_value = *reinterpret_cast<int64_t*>(buf_start);
-        CppType v;
+        auto& v = reinterpret_cast<CppType&>(column_data[origin_size + i]);
         v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz);
         if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
             // nanoseconds will be ignored.
             v.set_microsecond((date_value % _decode_params->second_mask) *
                               _decode_params->scale_to_nano_factor / 1000);
         }
-        ColumnType& cast_value = *reinterpret_cast<ColumnType*>(&v);
-        column_data.emplace_back(cast_value);
         _FIXED_SHIFT_DATA_OFFSET();
     }
     return Status::OK();
@@ -280,10 +299,12 @@ template <typename CppType, typename ColumnType>
 Status FixLengthDecoder::_decode_datetime96(MutableColumnPtr& doris_column, TypeIndex& logical_type,
                                             size_t num_values) {
     auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     for (int i = 0; i < num_values; ++i) {
         char* buf_start = _FIXED_GET_DATA_OFFSET(i);
         ParquetInt96& datetime96 = *reinterpret_cast<ParquetInt96*>(buf_start);
-        CppType v;
+        auto& v = reinterpret_cast<CppType&>(column_data[origin_size + i]);
         int64_t micros = datetime96.to_timestamp_micros();
         v.from_unixtime(micros / 1000000, *_decode_params->ctz);
         if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
@@ -291,8 +312,6 @@ Status FixLengthDecoder::_decode_datetime96(MutableColumnPtr& doris_column, Type
             // only keep microseconds.
             v.set_microsecond(micros % 1000000);
         }
-        ColumnType& cast_value = *reinterpret_cast<ColumnType*>(&v);
-        column_data.emplace_back(cast_value);
         _FIXED_SHIFT_DATA_OFFSET();
     }
     return Status::OK();
@@ -304,6 +323,8 @@ Status FixLengthDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
     init_decimal_converter<DecimalPrimitiveType>(data_type);
     auto& column_data =
             static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     DecimalScaleParams& scale_params = _decode_params->decimal_scale;
     for (int i = 0; i < num_values; ++i) {
         char* buf_start = _FIXED_GET_DATA_OFFSET(i);
@@ -318,8 +339,8 @@ Status FixLengthDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
         } else if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
             value /= scale_params.scale_factor;
         }
-        DecimalPrimitiveType cast_value(value);
-        column_data.emplace_back(*reinterpret_cast<Decimal<DecimalPrimitiveType>*>(&cast_value));
+        auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[origin_size + i]);
+        v = (DecimalPrimitiveType)value;
         _FIXED_SHIFT_DATA_OFFSET();
     }
     return Status::OK();
@@ -331,6 +352,8 @@ Status FixLengthDecoder::_decode_primitive_decimal(MutableColumnPtr& doris_colum
     init_decimal_converter<DecimalPrimitiveType>(data_type);
     auto& column_data =
             static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     DecimalScaleParams& scale_params = _decode_params->decimal_scale;
     for (int i = 0; i < num_values; ++i) {
         char* buf_start = _FIXED_GET_DATA_OFFSET(i);
@@ -341,8 +364,8 @@ Status FixLengthDecoder::_decode_primitive_decimal(MutableColumnPtr& doris_colum
         } else if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
             value /= scale_params.scale_factor;
         }
-        DecimalPrimitiveType cast_value(value);
-        column_data.emplace_back(*reinterpret_cast<Decimal<DecimalPrimitiveType>*>(&cast_value));
+        auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[origin_size + i]);
+        v = (DecimalPrimitiveType)value;
         _FIXED_SHIFT_DATA_OFFSET();
     }
     return Status::OK();
@@ -381,6 +404,8 @@ Status ByteArrayDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
     init_decimal_converter<DecimalPrimitiveType>(data_type);
     auto& column_data =
             static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
+    auto origin_size = column_data.size();
+    column_data.resize(origin_size + num_values);
     DecimalScaleParams& scale_params = _decode_params->decimal_scale;
     for (int i = 0; i < num_values; ++i) {
         char* buf_start;
@@ -409,8 +434,8 @@ Status ByteArrayDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
         } else if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
             value /= scale_params.scale_factor;
         }
-        DecimalPrimitiveType cast_value(value);
-        column_data.emplace_back(*reinterpret_cast<Decimal<DecimalPrimitiveType>*>(&cast_value));
+        auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[origin_size + i]);
+        v = (DecimalPrimitiveType)value;
     }
     return Status::OK();
 }
diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
index 517bab5d61..89792eaec1 100644
--- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
+++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
@@ -97,9 +97,7 @@ static bool _eval_in_val(PrimitiveType conjunct_type, std::vector<void*> in_pred
     }
     case TYPE_STRING:
     case TYPE_VARCHAR:
-    case TYPE_CHAR:
-    case TYPE_DATE:
-    case TYPE_DATETIME: {
+    case TYPE_CHAR: {
         std::vector<const char*> in_values;
         for (auto val : in_pred_values) {
             const char* value = ((std::string*)val)->data();
@@ -147,9 +145,7 @@ static bool _eval_eq(PrimitiveType conjunct_type, void* value, const char* min_b
     }
     case TYPE_STRING:
     case TYPE_VARCHAR:
-    case TYPE_CHAR:
-    case TYPE_DATE:
-    case TYPE_DATETIME: {
+    case TYPE_CHAR: {
         const char* conjunct_value = ((std::string*)value)->data();
         if (strcmp(conjunct_value, min_bytes) < 0 || strcmp(conjunct_value, max_bytes) > 0) {
             return true;
@@ -186,10 +182,7 @@ static bool _eval_gt(PrimitiveType conjunct_type, void* value, const char* max_b
     }
     case TYPE_STRING:
     case TYPE_VARCHAR:
-    case TYPE_CHAR:
-    case TYPE_DATE:
-    case TYPE_DATETIME: {
-        //            case TYPE_TIME:
+    case TYPE_CHAR: {
         const char* conjunct_value = ((std::string*)value)->data();
         if (strcmp(max_bytes, conjunct_value) <= 0) {
             return true;
@@ -226,10 +219,7 @@ static bool _eval_ge(PrimitiveType conjunct_type, void* value, const char* max_b
     }
     case TYPE_STRING:
     case TYPE_VARCHAR:
-    case TYPE_CHAR:
-    case TYPE_DATE:
-    case TYPE_DATETIME: {
-        //            case TYPE_TIME:
+    case TYPE_CHAR: {
         const char* conjunct_value = ((std::string*)value)->data();
         if (strcmp(max_bytes, conjunct_value) < 0) {
             return true;
@@ -266,10 +256,7 @@ static bool _eval_lt(PrimitiveType conjunct_type, void* value, const char* min_b
     }
     case TYPE_STRING:
     case TYPE_VARCHAR:
-    case TYPE_CHAR:
-    case TYPE_DATE:
-    case TYPE_DATETIME: {
-        //            case TYPE_TIME:
+    case TYPE_CHAR: {
         const char* conjunct_value = ((std::string*)value)->data();
         if (strcmp(min_bytes, conjunct_value) >= 0) {
             return true;
@@ -306,10 +293,7 @@ static bool _eval_le(PrimitiveType conjunct_type, void* value, const char* min_b
     }
     case TYPE_STRING:
     case TYPE_VARCHAR:
-    case TYPE_CHAR:
-    case TYPE_DATE:
-    case TYPE_DATETIME: {
-        //            case TYPE_TIME:
+    case TYPE_CHAR: {
         const char* conjunct_value = ((std::string*)value)->data();
         if (strcmp(min_bytes, conjunct_value) > 0) {
             return true;
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
index fc8b8cfdee..cce1a79e56 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
@@ -214,9 +214,7 @@ void ColumnChunkReader::insert_null_values(ColumnPtr& doris_column, size_t num_v
 
 void ColumnChunkReader::insert_null_values(MutableColumnPtr& doris_column, size_t num_values) {
     SCOPED_RAW_TIMER(&_statistics.decode_value_time);
-    for (int i = 0; i < num_values; ++i) {
-        doris_column->insert_default();
-    }
+    doris_column->insert_many_defaults(num_values);
     _remaining_num_values -= num_values;
 }
 
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index 5338c169f7..39774d1e8d 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -124,7 +124,9 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu
     auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
             (*std::move(doris_column)).mutate().get());
     MutableColumnPtr data_column = nullable_column->get_nested_column_ptr();
-    NullMap& map_data = nullable_column->get_null_map_data();
+    NullMap& map_data_column = nullable_column->get_null_map_data();
+    auto origin_size = map_data_column.size();
+    map_data_column.resize(origin_size + num_values);
 
     if (_chunk_reader->max_def_level() > 0) {
         LevelDecoder& def_decoder = _chunk_reader->def_level_decoder();
@@ -135,7 +137,7 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu
             bool is_null = def_level == 0;
             // fill NullMap
             for (int i = 0; i < loop_read; ++i) {
-                map_data.emplace_back(is_null);
+                map_data_column[origin_size + has_read + i] = (UInt8)is_null;
             }
             // decode data
             if (is_null) {
@@ -148,7 +150,7 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu
         }
     } else {
         for (int i = 0; i < num_values; ++i) {
-            map_data.emplace_back(false);
+            map_data_column[origin_size + i] = (UInt8) false;
         }
         RETURN_IF_ERROR(_chunk_reader->decode_values(data_column, type, num_values));
     }
@@ -256,7 +258,7 @@ Status ArrayColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr&
     CHECK(doris_column->is_nullable());
     auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
             (*std::move(doris_column)).mutate().get());
-    NullMap& map_data = nullable_column->get_null_map_data();
+    NullMap& map_data_column = nullable_column->get_null_map_data();
     MutableColumnPtr data_column = nullable_column->get_nested_column_ptr();
 
     // generate array offset
@@ -296,8 +298,11 @@ Status ArrayColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr&
             size_t scan_values =
                     element_offsets[offset_index + scan_rows] - element_offsets[offset_index];
             // null array, should ignore the last offset in element_offsets
+            auto origin_size = map_data_column.size();
+            map_data_column.resize(origin_size + scan_rows);
             for (int i = offset_index; i < offset_index + scan_rows; ++i) {
-                map_data.emplace_back(definitions[element_offsets[i]] == _NULL_ARRAY);
+                // i is an absolute row index starting at offset_index; rebase it into the scan_rows slots appended above.
+                map_data_column[origin_size + i - offset_index] = (UInt8)(definitions[element_offsets[i]] == _NULL_ARRAY);
             }
             // fill array offset, should skip a value when parsing null array
             _fill_array_offset(data_column, element_offsets, offset_index, scan_rows);
@@ -339,14 +344,23 @@ Status ArrayColumnReader::_load_nested_column(ColumnPtr& doris_column, DataTypeP
     level_t* definitions = _def_levels_buf.get();
     auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
             (*std::move(doris_column)).mutate().get());
-    NullMap& map_data = nullable_column->get_null_map_data();
+    NullMap& map_data_column = nullable_column->get_null_map_data();
     MutableColumnPtr data_column = nullable_column->get_nested_column_ptr();
+    size_t null_map_size = 0;
     for (int i = _def_offset; i < _def_offset + read_values; ++i) {
         // should skip _EMPTY_ARRAY and _NULL_ARRAY
+        if (definitions[i] == _CONCRETE_ELEMENT || definitions[i] == _NULL_ELEMENT) {
+            null_map_size++;
+        }
+    }
+    auto origin_size = map_data_column.size();
+    map_data_column.resize(origin_size + null_map_size);
+    size_t null_map_idx = origin_size;
+    for (int i = _def_offset; i < _def_offset + read_values; ++i) {
         if (definitions[i] == _CONCRETE_ELEMENT) {
-            map_data.emplace_back(false);
+            map_data_column[null_map_idx++] = (UInt8) false;
         } else if (definitions[i] == _NULL_ELEMENT) {
-            map_data.emplace_back(true);
+            map_data_column[null_map_idx++] = (UInt8) true;
         }
     }
     // column with null values


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org