You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/04/03 11:50:43 UTC

[incubator-doris] branch master updated: [Bug][Vectorized] Fix core bug of segment vectorized (#8800)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new fcefed7  [Bug][Vectorized] Fix core bug of segment vectorized (#8800)
fcefed7 is described below

commit fcefed7c1cb8a55debbd32f158e47e467e939d2e
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Sun Apr 3 19:50:25 2022 +0800

    [Bug][Vectorized] Fix core bug of segment vectorized (#8800)
    
    * [Bug][Vectorized] Fix core bug of segment vectorized
    1. Read table with delete condition
    2. Read table with default value HLL/Bitmap Column
    
    * refactor some code
    
    Co-authored-by: lihaopeng <li...@baidu.com>
---
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 79 ++++++++++++++--------
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 20 ++++--
 be/src/olap/rowset/segment_v2/segment_iterator.h   |  4 ++
 be/src/vec/core/block.cpp                          | 32 +++++----
 be/src/vec/core/block.h                            | 12 +++-
 5 files changed, 94 insertions(+), 53 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index d65c355..7611408 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -775,39 +775,58 @@ Status DefaultValueColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, b
 
 void DefaultValueColumnIterator::insert_default_data(vectorized::MutableColumnPtr &dst, size_t n) {
     vectorized::Int128 int128;
-    char* data_ptr = (char*)&int128;
+    char* data_ptr = (char *) &int128;
     size_t data_len = sizeof(int128);
 
-    auto type = _type_info->type();
-    if (type == OLAP_FIELD_TYPE_DATE) {
-        assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
-        std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
-
-        vectorized::VecDateTimeValue value;
-        value.from_date_str(str.c_str(), str.length());
-        value.cast_to_date();
-        //TODO: here is int128 = int64
-        int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
-    } else if (type == OLAP_FIELD_TYPE_DATETIME) {
-        assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
-        std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
-
-        vectorized::VecDateTimeValue value;
-        value.from_date_str(str.c_str(), str.length());
-        value.to_datetime();
-
-        int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
-    } else if (type == OLAP_FIELD_TYPE_DECIMAL) {
-        assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
-        decimal12_t* d = (decimal12_t*)_mem_value;
-        int128 = DecimalV2Value(d->integer, d->fraction).value();
-    } else {
-        data_ptr = (char*)_mem_value;
-        data_len = _type_size;
-    }
+    auto insert_column_data = [&]() {
+        for (size_t i = 0; i < n; ++i) {
+            dst->insert_data(data_ptr, data_len);
+        }
+    };
+
+    switch (_type_info->type()) {
+        case OLAP_FIELD_TYPE_OBJECT:
+        case OLAP_FIELD_TYPE_HLL:{
+            dst->insert_many_defaults(n);
+            break;
+        }
+
+        case OLAP_FIELD_TYPE_DATE: {
+            assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
+            std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
+
+            vectorized::VecDateTimeValue value;
+            value.from_date_str(str.c_str(), str.length());
+            value.cast_to_date();
+            // TODO: an int64 is assigned to the int128 here; this relies on little-endian byte order
+            int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
+            insert_column_data();
+            break;
+        }
+        case OLAP_FIELD_TYPE_DATETIME: {
+            assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
+            std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
 
-    for (size_t i = 0; i < n; ++i) {
-        dst->insert_data(data_ptr, data_len);
+            vectorized::VecDateTimeValue value;
+            value.from_date_str(str.c_str(), str.length());
+            value.to_datetime();
+
+            int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
+            insert_column_data();
+            break;
+        }
+        case OLAP_FIELD_TYPE_DECIMAL: {
+            assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
+            decimal12_t *d = (decimal12_t *) _mem_value;
+            int128 = DecimalV2Value(d->integer, d->fraction).value();
+            insert_column_data();
+            break;
+        }
+        default: {
+            data_ptr = (char *) _mem_value;
+            data_len = _type_size;
+            insert_column_data();
+        }
     }
 }
 
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index ed0429c..53bf87a 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -134,6 +134,7 @@ Status SegmentIterator::_init(bool is_vec) {
     RETURN_IF_ERROR(_get_row_ranges_by_column_conditions());
     if (is_vec) {
         _vec_init_lazy_materialization();
+        _vec_init_char_column_id();
     } else {
         _init_lazy_materialization();
     }
@@ -707,6 +708,17 @@ void SegmentIterator::_vec_init_lazy_materialization() {
     }
 }
 
+void SegmentIterator::_vec_init_char_column_id() {
+    for (size_t i = 0; i < _schema.num_column_ids(); i++) {
+        auto cid = _schema.column_id(i);
+        auto column_desc = _schema.column(cid);
+
+        if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
+            _char_type_idx.emplace_back(i);
+        }
+    }
+}
+
 Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
                                       vectorized::MutableColumns& column_block, size_t nrows) {
     for (auto cid : column_ids) {
@@ -720,8 +732,6 @@ Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
 
 void SegmentIterator::_init_current_block(
         vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) {
-    _char_type_idx.clear();
-
     bool is_block_mem_reuse = block->mem_reuse();
     if (is_block_mem_reuse) {
         block->clear_column_data(_schema.num_column_ids());
@@ -738,11 +748,7 @@ void SegmentIterator::_init_current_block(
         auto cid = _schema.column_id(i);
         auto column_desc = _schema.column(cid);
 
-        if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
-            _char_type_idx.emplace_back(i);
-        }
-
-        if (_is_pred_column[cid]) { //todo(wb) maybe we can relase it after output block
+        if (_is_pred_column[cid]) { //todo(wb) maybe we can release it after output block
             current_columns[cid]->clear();
         } else { // non-predicate column
             if (is_block_mem_reuse) {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 07ee964..42a2caf 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -79,6 +79,10 @@ private:
 
     void _init_lazy_materialization();
     void _vec_init_lazy_materialization();
+    // TODO: Fix Me
+    // The storage layer pads CHAR values with trailing 0s to their length, but the query engine needs to ignore that padding.
+    // So the segment iterator needs to shrink char columns before outputting them. Only used in the vec query engine.
+    void _vec_init_char_column_id();
 
     uint32_t segment_id() const { return _segment->id(); }
     uint32_t num_rows() const { return _segment->num_rows(); }
diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp
index ca44be9..dc970df 100644
--- a/be/src/vec/core/block.cpp
+++ b/be/src/vec/core/block.cpp
@@ -924,22 +924,24 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const {
     return temp_block;
 }
 
-void Block::shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx) {
+void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) {
     for (auto idx : char_type_idx) {
-        if (this->get_by_position(idx).column->is_nullable()) {
-            this->get_by_position(idx).column = ColumnNullable::create(
-                    reinterpret_cast<const ColumnString*>(
-                            reinterpret_cast<const ColumnNullable*>(
-                                    this->get_by_position(idx).column.get())
-                                    ->get_nested_column_ptr()
-                                    .get())
-                            ->get_shinked_column(),
-                    reinterpret_cast<const ColumnNullable*>(this->get_by_position(idx).column.get())
-                            ->get_null_map_column_ptr());
-        } else {
-            this->get_by_position(idx).column =
-                    reinterpret_cast<const ColumnString*>(this->get_by_position(idx).column.get())
-                            ->get_shinked_column();
+        if (idx < data.size()) {
+            if (this->get_by_position(idx).column->is_nullable()) {
+                this->get_by_position(idx).column = ColumnNullable::create(
+                        reinterpret_cast<const ColumnString *>(
+                                reinterpret_cast<const ColumnNullable *>(
+                                        this->get_by_position(idx).column.get())
+                                        ->get_nested_column_ptr()
+                                        .get())
+                                ->get_shinked_column(),
+                        reinterpret_cast<const ColumnNullable *>(this->get_by_position(idx).column.get())
+                                ->get_null_map_column_ptr());
+            } else {
+                this->get_by_position(idx).column =
+                        reinterpret_cast<const ColumnString *>(this->get_by_position(idx).column.get())
+                                ->get_shinked_column();
+            }
         }
     }
 }
diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h
index 8fa4a30..82d8514 100644
--- a/be/src/vec/core/block.h
+++ b/be/src/vec/core/block.h
@@ -104,6 +104,16 @@ public:
                                      doris::vectorized::IColumn* input_col_ptr,
                                      uint16_t* sel_rowid_idx, uint16_t select_size, int block_cid,
                                      size_t batch_size) {
+        // Only columns that are materialized solely for an additional delete filter condition sit past the end of the block.
+        // We should not materialize columns that the query engine does not need, so just return OK here.
+        // E.g.:
+        //      `delete from table where a = 10;`
+        //      `select b from table;`
+        // Column a is only used inside the segment iterator; the block from the query engine only contains column b,
+        // so `block_cid >= data.size()` is true for column a.
+        if (block_cid >= data.size())
+            return Status::OK();
+
         if (is_block_mem_reuse) {
             auto* raw_res_ptr = this->get_by_position(block_cid).column.get();
             const_cast<doris::vectorized::IColumn*>(raw_res_ptr)->reserve(batch_size);
@@ -296,7 +306,7 @@ public:
     doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int,
                                   bool padding_char = false);
 
-    void shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx);
+    void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx);
 
 private:
     void erase_impl(size_t position);

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org