You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2021/07/21 02:50:46 UTC
[incubator-doris] branch master updated: [Enhance] improve performance of init_scan_key by sharing the schema (#6099)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2d78c31 [Enhance] improve performance of init_scan_key by sharing the schema (#6099)
2d78c31 is described below
commit 2d78c31d4956f24ebce106e23680e85dcbe5e030
Author: huangmengbin <44...@users.noreply.github.com>
AuthorDate: Wed Jul 21 10:50:31 2021 +0800
[Enhance] improve performance of init_scan_key by sharing the schema (#6099)
Co-authored-by: huangmengbin <hu...@bytedance.com>
---
be/src/olap/generic_iterators.cpp | 4 +-
be/src/olap/reader.cpp | 30 +++++-
be/src/olap/row_cursor.cpp | 115 +++++++++++++--------
be/src/olap/row_cursor.h | 11 +-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 4 +-
be/src/olap/schema.h | 1 -
be/test/olap/row_cursor_test.cpp | 6 +-
7 files changed, 118 insertions(+), 53 deletions(-)
diff --git a/be/src/olap/generic_iterators.cpp b/be/src/olap/generic_iterators.cpp
index 66e97d6..8fd8e68 100644
--- a/be/src/olap/generic_iterators.cpp
+++ b/be/src/olap/generic_iterators.cpp
@@ -66,9 +66,9 @@ Status AutoIncrementIterator::next_batch(RowBlockV2* block) {
while (row_idx < block->capacity() && _rows_returned < _num_rows) {
RowBlockRow row = block->row(row_idx);
- for (int i = 0; i < _schema.columns().size(); ++i) {
+ for (int i = 0; i < _schema.num_columns(); ++i) {
row.set_is_null(i, false);
- auto& col_schema = _schema.columns()[i];
+ const auto* col_schema = _schema.column(i);
switch (col_schema->type()) {
case OLAP_FIELD_TYPE_SMALLINT:
*(int16_t*)row.cell_ptr(i) = _rows_returned + i;
diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp
index fc6020f..c2cc1df 100644
--- a/be/src/olap/reader.cpp
+++ b/be/src/olap/reader.cpp
@@ -31,6 +31,7 @@
#include "olap/row_block.h"
#include "olap/row_cursor.h"
#include "olap/rowset/beta_rowset_reader.h"
+#include "olap/schema.h"
#include "olap/rowset/column_data.h"
#include "olap/storage_engine.h"
#include "olap/tablet.h"
@@ -549,14 +550,34 @@ OLAPStatus Reader::_init_keys_param(const ReaderParams& read_params) {
size_t start_key_size = read_params.start_key.size();
_keys_param.start_keys.resize(start_key_size, nullptr);
+
+ size_t scan_key_size = read_params.start_key.front().size();
+ if (scan_key_size > _tablet->tablet_schema().num_columns()) {
+ LOG(WARNING)
+ << "Input param are invalid. Column count is bigger than num_columns of schema. "
+ << "column_count=" << scan_key_size
+ << ", schema.num_columns=" << _tablet->tablet_schema().num_columns();
+ return OLAP_ERR_INPUT_PARAMETER_ERROR;
+ }
+
+ std::vector<uint32_t> columns(scan_key_size);
+ std::iota(columns.begin(), columns.end(), 0);
+
+ std::shared_ptr<Schema> schema = std::make_shared<Schema>(_tablet->tablet_schema().columns(), columns);
+
for (size_t i = 0; i < start_key_size; ++i) {
+ if (read_params.start_key[i].size() != scan_key_size) {
+ OLAP_LOG_WARNING("The start_key.at(%ld).size == %ld, not equals the %ld", i, read_params.start_key[i].size(), scan_key_size);
+ return OLAP_ERR_INPUT_PARAMETER_ERROR;
+ }
+
if ((_keys_param.start_keys[i] = new (nothrow) RowCursor()) == nullptr) {
OLAP_LOG_WARNING("fail to new RowCursor!");
return OLAP_ERR_MALLOC_ERROR;
}
OLAPStatus res = _keys_param.start_keys[i]->init_scan_key(
- _tablet->tablet_schema(), read_params.start_key[i].values());
+ _tablet->tablet_schema(), read_params.start_key[i].values(), schema);
if (res != OLAP_SUCCESS) {
OLAP_LOG_WARNING("fail to init row cursor. [res=%d]", res);
return res;
@@ -572,13 +593,18 @@ OLAPStatus Reader::_init_keys_param(const ReaderParams& read_params) {
size_t end_key_size = read_params.end_key.size();
_keys_param.end_keys.resize(end_key_size, nullptr);
for (size_t i = 0; i < end_key_size; ++i) {
+ if (read_params.end_key[i].size() != scan_key_size) {
+ OLAP_LOG_WARNING("The end_key.at(%ld).size == %ld, not equals the %ld", i, read_params.end_key[i].size(), scan_key_size);
+ return OLAP_ERR_INPUT_PARAMETER_ERROR;
+ }
+
if ((_keys_param.end_keys[i] = new (nothrow) RowCursor()) == nullptr) {
OLAP_LOG_WARNING("fail to new RowCursor!");
return OLAP_ERR_MALLOC_ERROR;
}
OLAPStatus res = _keys_param.end_keys[i]->init_scan_key(_tablet->tablet_schema(),
- read_params.end_key[i].values());
+ read_params.end_key[i].values(), schema);
if (res != OLAP_SUCCESS) {
OLAP_LOG_WARNING("fail to init row cursor. [res=%d]", res);
return res;
diff --git a/be/src/olap/row_cursor.cpp b/be/src/olap/row_cursor.cpp
index acde936..020ccf7 100644
--- a/be/src/olap/row_cursor.cpp
+++ b/be/src/olap/row_cursor.cpp
@@ -35,9 +35,7 @@ RowCursor::~RowCursor() {
delete[] _variable_buf;
}
-OLAPStatus RowCursor::_init(const std::vector<TabletColumn>& schema,
- const std::vector<uint32_t>& columns) {
- _schema.reset(new Schema(schema, columns));
+OLAPStatus RowCursor::_init(const std::vector<uint32_t>& columns) {
_variable_len = 0;
for (auto cid : columns) {
if (_schema->column(cid) == nullptr) {
@@ -59,6 +57,61 @@ OLAPStatus RowCursor::_init(const std::vector<TabletColumn>& schema,
return OLAP_SUCCESS;
}
+OLAPStatus RowCursor::_init(const std::shared_ptr<Schema>& shared_schema,
+ const std::vector<uint32_t>& columns) {
+ _schema = shared_schema;
+ return _init(columns);
+}
+
+OLAPStatus RowCursor::_init(const std::vector<TabletColumn>& schema,
+ const std::vector<uint32_t>& columns) {
+ _schema.reset(new Schema(schema, columns));
+ return _init(columns);
+}
+
+OLAPStatus RowCursor::_init_scan_key(const TabletSchema& schema, const std::vector<std::string>& scan_keys) {
+ // NOTE: cid equal with column index
+ // Hyperloglog cannot be key, no need to handle it
+ _variable_len = 0;
+ for (auto cid : _schema->column_ids()) {
+ const TabletColumn& column = schema.column(cid);
+ FieldType type = column.type();
+ if (type == OLAP_FIELD_TYPE_VARCHAR) {
+ _variable_len += scan_keys[cid].length();
+ } else if (type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_ARRAY) {
+ _variable_len += std::max(scan_keys[cid].length(), column.length());
+ }
+ }
+
+ // variable_len for null bytes
+ _variable_buf = new (nothrow) char[_variable_len];
+ if (_variable_buf == nullptr) {
+ OLAP_LOG_WARNING("Fail to malloc _variable_buf.");
+ return OLAP_ERR_MALLOC_ERROR;
+ }
+ memset(_variable_buf, 0, _variable_len);
+ char* fixed_ptr = _fixed_buf;
+ char* variable_ptr = _variable_buf;
+ for (auto cid : _schema->column_ids()) {
+ const TabletColumn& column = schema.column(cid);
+ fixed_ptr = _fixed_buf + _schema->column_offset(cid);
+ FieldType type = column.type();
+ if (type == OLAP_FIELD_TYPE_VARCHAR) {
+ Slice* slice = reinterpret_cast<Slice*>(fixed_ptr + 1);
+ slice->data = variable_ptr;
+ slice->size = scan_keys[cid].length();
+ variable_ptr += scan_keys[cid].length();
+ } else if (type == OLAP_FIELD_TYPE_CHAR) {
+ Slice* slice = reinterpret_cast<Slice*>(fixed_ptr + 1);
+ slice->data = variable_ptr;
+ slice->size = std::max(scan_keys[cid].length(), column.length());
+ variable_ptr += slice->size;
+ }
+ }
+
+ return OLAP_SUCCESS;
+}
+
OLAPStatus RowCursor::init(const TabletSchema& schema) {
return init(schema.columns(), schema.num_columns());
}
@@ -116,53 +169,27 @@ OLAPStatus RowCursor::init_scan_key(const TabletSchema& schema,
return OLAP_ERR_INPUT_PARAMETER_ERROR;
}
- std::vector<uint32_t> columns;
- for (size_t i = 0; i < scan_key_size; ++i) {
- columns.push_back(i);
- }
+ std::vector<uint32_t> columns(scan_key_size);
+ std::iota(columns.begin(), columns.end(), 0);
RETURN_NOT_OK(_init(schema.columns(), columns));
- // NOTE: cid equal with column index
- // Hyperloglog cannot be key, no need to handle it
- _variable_len = 0;
- for (auto cid : _schema->column_ids()) {
- const TabletColumn& column = schema.column(cid);
- FieldType type = column.type();
- if (type == OLAP_FIELD_TYPE_VARCHAR) {
- _variable_len += scan_keys[cid].length();
- } else if (type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_ARRAY) {
- _variable_len += std::max(scan_keys[cid].length(), column.length());
- }
- }
+ return _init_scan_key(schema, scan_keys);
+}
- // variable_len for null bytes
- _variable_buf = new (nothrow) char[_variable_len];
- if (_variable_buf == nullptr) {
- OLAP_LOG_WARNING("Fail to malloc _variable_buf.");
- return OLAP_ERR_MALLOC_ERROR;
- }
- memset(_variable_buf, 0, _variable_len);
- char* fixed_ptr = _fixed_buf;
- char* variable_ptr = _variable_buf;
- for (auto cid : _schema->column_ids()) {
- const TabletColumn& column = schema.column(cid);
- fixed_ptr = _fixed_buf + _schema->column_offset(cid);
- FieldType type = column.type();
- if (type == OLAP_FIELD_TYPE_VARCHAR) {
- Slice* slice = reinterpret_cast<Slice*>(fixed_ptr + 1);
- slice->data = variable_ptr;
- slice->size = scan_keys[cid].length();
- variable_ptr += scan_keys[cid].length();
- } else if (type == OLAP_FIELD_TYPE_CHAR) {
- Slice* slice = reinterpret_cast<Slice*>(fixed_ptr + 1);
- slice->data = variable_ptr;
- slice->size = std::max(scan_keys[cid].length(), column.length());
- variable_ptr += slice->size;
- }
+OLAPStatus RowCursor::init_scan_key(const TabletSchema& schema,
+ const std::vector<std::string>& scan_keys,
+ const std::shared_ptr<Schema>& shared_schema) {
+ size_t scan_key_size = scan_keys.size();
+
+ std::vector<uint32_t> columns;
+ for (size_t i = 0; i < scan_key_size; ++i) {
+ columns.push_back(i);
}
- return OLAP_SUCCESS;
+ RETURN_NOT_OK(_init(shared_schema, columns));
+
+ return _init_scan_key(schema, scan_keys);
}
// TODO(yingchun): parameter 'const TabletSchema& schema' is not used
diff --git a/be/src/olap/row_cursor.h b/be/src/olap/row_cursor.h
index cc8dc45..222a4f9 100644
--- a/be/src/olap/row_cursor.h
+++ b/be/src/olap/row_cursor.h
@@ -55,6 +55,10 @@ public:
// 目前仅用在拆分key区间的时候
OLAPStatus init_scan_key(const TabletSchema& schema, const std::vector<std::string>& keys);
+ OLAPStatus init_scan_key(const TabletSchema& schema,
+ const std::vector<std::string>& keys,
+ const std::shared_ptr<Schema>& shared_schema);
+
//allocate memory for string type, which include char, varchar, hyperloglog
OLAPStatus allocate_memory_for_string_type(const TabletSchema& schema);
@@ -143,10 +147,15 @@ public:
char* row_ptr() const { return _fixed_buf; }
private:
+ OLAPStatus _init(const std::vector<uint32_t>& columns);
+ OLAPStatus _init(const std::shared_ptr<Schema>& shared_schema,
+ const std::vector<uint32_t>& columns);
// common init function
OLAPStatus _init(const std::vector<TabletColumn>& schema, const std::vector<uint32_t>& columns);
- std::unique_ptr<Schema> _schema;
+ OLAPStatus _init_scan_key(const TabletSchema& schema, const std::vector<std::string>& scan_keys);
+
+ std::shared_ptr<Schema> _schema;
char* _fixed_buf = nullptr; // point to fixed buf
size_t _fixed_len;
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 50e8feb..e2d4b1d 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -177,13 +177,13 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra
if (key_range.lower_key != nullptr) {
for (auto cid : key_range.lower_key->schema()->column_ids()) {
column_set.emplace(cid);
- key_fields.emplace_back(key_range.lower_key->schema()->column(cid));
+ key_fields.emplace_back(key_range.lower_key->column_schema(cid));
}
}
if (key_range.upper_key != nullptr) {
for (auto cid : key_range.upper_key->schema()->column_ids()) {
if (column_set.count(cid) == 0) {
- key_fields.emplace_back(key_range.upper_key->schema()->column(cid));
+ key_fields.emplace_back(key_range.upper_key->column_schema(cid));
column_set.emplace(cid);
}
}
diff --git a/be/src/olap/schema.h b/be/src/olap/schema.h
index 0a2add8..0a7c8e9 100644
--- a/be/src/olap/schema.h
+++ b/be/src/olap/schema.h
@@ -100,7 +100,6 @@ public:
~Schema();
- const std::vector<Field*>& columns() const { return _cols; }
const Field* column(ColumnId cid) const { return _cols[cid]; }
size_t num_key_columns() const { return _num_key_columns; }
diff --git a/be/test/olap/row_cursor_test.cpp b/be/test/olap/row_cursor_test.cpp
index a11ef6d..34410b9 100644
--- a/be/test/olap/row_cursor_test.cpp
+++ b/be/test/olap/row_cursor_test.cpp
@@ -21,6 +21,7 @@
#include "common/object_pool.h"
#include "olap/row.h"
+#include "olap/schema.h"
#include "olap/tablet_schema.h"
#include "runtime/mem_pool.h"
#include "runtime/mem_tracker.h"
@@ -310,8 +311,11 @@ TEST_F(TestRowCursor, InitRowCursorWithScanKey) {
scan_keys.push_back("char_exceed_length");
scan_keys.push_back("varchar_exceed_length");
+ std::vector<uint32_t> columns{0, 1};
+ std::shared_ptr<Schema> schema = std::make_shared<Schema>(tablet_schema.columns(), columns);
+
RowCursor row;
- OLAPStatus res = row.init_scan_key(tablet_schema, scan_keys);
+ OLAPStatus res = row.init_scan_key(tablet_schema, scan_keys, schema);
ASSERT_EQ(res, OLAP_SUCCESS);
ASSERT_EQ(row.get_fixed_len(), 34);
ASSERT_EQ(row.get_variable_len(), 39);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org