You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/04/03 11:50:43 UTC
[incubator-doris] branch master updated: [Bug][Vectorized] Fix core bug of segment vectorized (#8800)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new fcefed7 [Bug][Vectorized] Fix core bug of segment vectorized (#8800)
fcefed7 is described below
commit fcefed7c1cb8a55debbd32f158e47e467e939d2e
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Sun Apr 3 19:50:25 2022 +0800
[Bug][Vectorized] Fix core bug of segment vectorized (#8800)
* [Bug][Vectorized] Fix core bug of segment vectorized
1. Read table with delete condition
2. Read table with default value HLL/Bitmap Column
* refactor some code
Co-authored-by: lihaopeng <li...@baidu.com>
---
be/src/olap/rowset/segment_v2/column_reader.cpp | 79 ++++++++++++++--------
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 20 ++++--
be/src/olap/rowset/segment_v2/segment_iterator.h | 4 ++
be/src/vec/core/block.cpp | 32 +++++----
be/src/vec/core/block.h | 12 +++-
5 files changed, 94 insertions(+), 53 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index d65c355..7611408 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -775,39 +775,58 @@ Status DefaultValueColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, b
void DefaultValueColumnIterator::insert_default_data(vectorized::MutableColumnPtr &dst, size_t n) {
vectorized::Int128 int128;
- char* data_ptr = (char*)&int128;
+ char* data_ptr = (char *) &int128;
size_t data_len = sizeof(int128);
- auto type = _type_info->type();
- if (type == OLAP_FIELD_TYPE_DATE) {
- assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
- std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
-
- vectorized::VecDateTimeValue value;
- value.from_date_str(str.c_str(), str.length());
- value.cast_to_date();
- //TODO: here is int128 = int64
- int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
- } else if (type == OLAP_FIELD_TYPE_DATETIME) {
- assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
- std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
-
- vectorized::VecDateTimeValue value;
- value.from_date_str(str.c_str(), str.length());
- value.to_datetime();
-
- int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
- } else if (type == OLAP_FIELD_TYPE_DECIMAL) {
- assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
- decimal12_t* d = (decimal12_t*)_mem_value;
- int128 = DecimalV2Value(d->integer, d->fraction).value();
- } else {
- data_ptr = (char*)_mem_value;
- data_len = _type_size;
- }
+ auto insert_column_data = [&]() {
+ for (size_t i = 0; i < n; ++i) {
+ dst->insert_data(data_ptr, data_len);
+ }
+ };
+
+ switch (_type_info->type()) {
+ case OLAP_FIELD_TYPE_OBJECT:
+ case OLAP_FIELD_TYPE_HLL:{
+ dst->insert_many_defaults(n);
+ break;
+ }
+
+ case OLAP_FIELD_TYPE_DATE: {
+ assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t
+ std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value);
+
+ vectorized::VecDateTimeValue value;
+ value.from_date_str(str.c_str(), str.length());
+ value.cast_to_date();
+ //TODO: an int64 value is assigned to int128 here; reading it back through data_ptr relies on little-endian byte order
+ int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
+ insert_column_data();
+ break;
+ }
+ case OLAP_FIELD_TYPE_DATETIME: {
+ assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t
+ std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value);
- for (size_t i = 0; i < n; ++i) {
- dst->insert_data(data_ptr, data_len);
+ vectorized::VecDateTimeValue value;
+ value.from_date_str(str.c_str(), str.length());
+ value.to_datetime();
+
+ int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value);
+ insert_column_data();
+ break;
+ }
+ case OLAP_FIELD_TYPE_DECIMAL: {
+ assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t
+ decimal12_t *d = (decimal12_t *) _mem_value;
+ int128 = DecimalV2Value(d->integer, d->fraction).value();
+ insert_column_data();
+ break;
+ }
+ default: {
+ data_ptr = (char *) _mem_value;
+ data_len = _type_size;
+ insert_column_data();
+ }
}
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index ed0429c..53bf87a 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -134,6 +134,7 @@ Status SegmentIterator::_init(bool is_vec) {
RETURN_IF_ERROR(_get_row_ranges_by_column_conditions());
if (is_vec) {
_vec_init_lazy_materialization();
+ _vec_init_char_column_id();
} else {
_init_lazy_materialization();
}
@@ -707,6 +708,17 @@ void SegmentIterator::_vec_init_lazy_materialization() {
}
}
+void SegmentIterator::_vec_init_char_column_id() {
+ for (size_t i = 0; i < _schema.num_column_ids(); i++) {
+ auto cid = _schema.column_id(i);
+ auto column_desc = _schema.column(cid);
+
+ if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
+ _char_type_idx.emplace_back(i);
+ }
+ }
+}
+
Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
vectorized::MutableColumns& column_block, size_t nrows) {
for (auto cid : column_ids) {
@@ -720,8 +732,6 @@ Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
void SegmentIterator::_init_current_block(
vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) {
- _char_type_idx.clear();
-
bool is_block_mem_reuse = block->mem_reuse();
if (is_block_mem_reuse) {
block->clear_column_data(_schema.num_column_ids());
@@ -738,11 +748,7 @@ void SegmentIterator::_init_current_block(
auto cid = _schema.column_id(i);
auto column_desc = _schema.column(cid);
- if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) {
- _char_type_idx.emplace_back(i);
- }
-
- if (_is_pred_column[cid]) { //todo(wb) maybe we can relase it after output block
+ if (_is_pred_column[cid]) { //todo(wb) maybe we can release it after output block
current_columns[cid]->clear();
} else { // non-predicate column
if (is_block_mem_reuse) {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 07ee964..42a2caf 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -79,6 +79,10 @@ private:
void _init_lazy_materialization();
void _vec_init_lazy_materialization();
+ // TODO: Fix Me
+ // The storage layer pads CHAR values with trailing zeros up to the column length, but the query engine must ignore that padding,
+ // so the segment iterator needs to shrink CHAR columns before outputting them. Only used by the vectorized query engine.
+ void _vec_init_char_column_id();
uint32_t segment_id() const { return _segment->id(); }
uint32_t num_rows() const { return _segment->num_rows(); }
diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp
index ca44be9..dc970df 100644
--- a/be/src/vec/core/block.cpp
+++ b/be/src/vec/core/block.cpp
@@ -924,22 +924,24 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const {
return temp_block;
}
-void Block::shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx) {
+void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) {
for (auto idx : char_type_idx) {
- if (this->get_by_position(idx).column->is_nullable()) {
- this->get_by_position(idx).column = ColumnNullable::create(
- reinterpret_cast<const ColumnString*>(
- reinterpret_cast<const ColumnNullable*>(
- this->get_by_position(idx).column.get())
- ->get_nested_column_ptr()
- .get())
- ->get_shinked_column(),
- reinterpret_cast<const ColumnNullable*>(this->get_by_position(idx).column.get())
- ->get_null_map_column_ptr());
- } else {
- this->get_by_position(idx).column =
- reinterpret_cast<const ColumnString*>(this->get_by_position(idx).column.get())
- ->get_shinked_column();
+ if (idx < data.size()) {
+ if (this->get_by_position(idx).column->is_nullable()) {
+ this->get_by_position(idx).column = ColumnNullable::create(
+ reinterpret_cast<const ColumnString *>(
+ reinterpret_cast<const ColumnNullable *>(
+ this->get_by_position(idx).column.get())
+ ->get_nested_column_ptr()
+ .get())
+ ->get_shinked_column(),
+ reinterpret_cast<const ColumnNullable *>(this->get_by_position(idx).column.get())
+ ->get_null_map_column_ptr());
+ } else {
+ this->get_by_position(idx).column =
+ reinterpret_cast<const ColumnString *>(this->get_by_position(idx).column.get())
+ ->get_shinked_column();
+ }
}
}
}
diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h
index 8fa4a30..82d8514 100644
--- a/be/src/vec/core/block.h
+++ b/be/src/vec/core/block.h
@@ -104,6 +104,16 @@ public:
doris::vectorized::IColumn* input_col_ptr,
uint16_t* sel_rowid_idx, uint16_t select_size, int block_cid,
size_t batch_size) {
+ // Only a column that is materialized solely for an additional delete-filter condition can sit past the end of the block.
+ // We should not materialize columns that the query engine does not need, so just return OK here.
+ // Eg:
+ // `delete from table where a = 10;`
+ // `select b from table;`
+ // Column a is only used inside the segment iterator; the block from the query engine contains only column b,
+ // so `block_cid >= data.size()` is true for column a.
+ if (block_cid >= data.size())
+ return Status::OK();
+
if (is_block_mem_reuse) {
auto* raw_res_ptr = this->get_by_position(block_cid).column.get();
const_cast<doris::vectorized::IColumn*>(raw_res_ptr)->reserve(batch_size);
@@ -296,7 +306,7 @@ public:
doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int,
bool padding_char = false);
- void shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx);
+ void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx);
private:
void erase_impl(size_t position);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org