You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/01/14 13:06:57 UTC
[doris] branch master updated: [Feature](inverted index) implementation of inverted index writer for numeric types, using bkd index (#15918)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0206e0bc57 [Feature](inverted index) implementation of inverted index writer for numeric types, using bkd index (#15918)
0206e0bc57 is described below
commit 0206e0bc57b26f053726febf1ef816d5ac696c37
Author: airborne12 <ai...@gmail.com>
AuthorDate: Sat Jan 14 21:06:51 2023 +0800
[Feature](inverted index) implementation of inverted index writer for numeric types, using bkd index (#15918)
Step 3 of DSIP-023: Add inverted index for full text search.
Implementation of the inverted index writer for numeric types, using a BKD index.
Dependency PRs: #14207, #15807, #15821
---
be/src/common/config.h | 2 +
.../rowset/segment_v2/inverted_index_writer.cpp | 106 ++++++++++++++++++---
2 files changed, 96 insertions(+), 12 deletions(-)
diff --git a/be/src/common/config.h b/be/src/common/config.h
index e0eb895175..e8263388f9 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -880,6 +880,8 @@ CONF_mDouble(inverted_index_ram_buffer_size, "512");
CONF_Int32(query_bkd_inverted_index_limit_percent, "5"); // 5%
// dict path for chinese analyzer
CONF_String(inverted_index_dict_path, "${DORIS_HOME}/dict");
+// tree depth for bkd index
+CONF_Int32(max_depth_in_bkd_tree, "32");
#ifdef BE_TEST
// test s3
CONF_String(test_s3_resource, "resource");
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 3df38bec92..31e4cff6bf 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -19,6 +19,7 @@
#include <CLucene.h>
#include <CLucene/analysis/LanguageBasedAnalyzer.h>
+#include <CLucene/util/bkd/bkd_writer.h>
#include <memory>
@@ -45,6 +46,9 @@ namespace doris::segment_v2 {
const int32_t MAX_FIELD_LEN = 0x7FFFFFFFL;
const int32_t MAX_BUFFER_DOCS = 100000000;
const int32_t MERGE_FACTOR = 100000000;
+const int32_t MAX_LEAF_COUNT = 1024;
+const float MAXMBSortInHeap = 512.0 * 8;
+const int DIMS = 1;
const std::string empty_value;
template <FieldType field_type>
@@ -62,6 +66,7 @@ public:
_index_meta(index_meta) {
_parser_type = get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(_index_meta->properties()));
+ _value_key_coder = get_key_coder(field_type);
_field_name = std::wstring(field_name.begin(), field_name.end());
};
@@ -72,12 +77,14 @@ public:
if constexpr (field_is_slice_type(field_type)) {
return init_fulltext_index();
} else if constexpr (field_is_numeric_type(field_type)) {
- return Status::Error<doris::ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+ return init_bkd_index();
}
- return Status::InternalError("field type not supported");
+ return Status::Error<doris::ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+ "Field type not supported");
} catch (const CLuceneError& e) {
LOG(WARNING) << "Inverted index writer init error occurred: " << e.what();
- return Status::Error<doris::ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+ return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+ "Inverted index writer init error occurred");
}
}
@@ -104,6 +111,17 @@ public:
}
};
+ Status init_bkd_index() {
+ size_t value_length = sizeof(CppType);
+ // NOTE: initialize with 0, set to max_row_id when finished.
+ int32_t max_doc = 0;
+ int32_t total_point_count = std::numeric_limits<std::int32_t>::max();
+ _bkd_writer = std::make_shared<lucene::util::bkd::bkd_writer>(
+ max_doc, DIMS, DIMS, value_length, MAX_LEAF_COUNT, MAXMBSortInHeap,
+ total_point_count, true, config::max_depth_in_bkd_tree);
+ return Status::OK();
+ }
+
Status init_fulltext_index() {
bool create = true;
@@ -177,11 +195,8 @@ public:
void new_fulltext_field(const char* field_value_data, size_t field_value_size) {
if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
- //NOTE:if parser is english analyzer, just construct token stream to analyzer for efficiency.
new_char_token_stream(field_value_data, field_value_size, _field);
} else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
- //NOTE:if parser is chinese analyzer, need to do utf8->unicode->wide_char
- //that's inefficient, need to do performance test.
auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
new lucene::util::AStringReader(field_value_data, field_value_size),
lucene::util::SimpleInputStreamReader::UTF8);
@@ -219,7 +234,7 @@ public:
_rid++;
}
} else if constexpr (field_is_numeric_type(field_type)) {
- return Status::Error<doris::ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+ add_numeric_values(values, count);
}
return Status::OK();
}
@@ -251,25 +266,88 @@ public:
_index_writer->addDocument(_doc);
}
} else if constexpr (field_is_numeric_type(field_type)) {
- //TODO
- return Status::Error<doris::ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+ auto p = reinterpret_cast<const CppType*>(item_data_ptr);
+ for (int i = 0; i < count; ++i) {
+ for (size_t j = 0; j < values->length(); ++j) {
+ if (values->is_null_at(j)) {
+ // bkd do not index null values, so we do nothing here.
+ } else {
+ std::string new_value;
+ size_t value_length = sizeof(CppType);
+
+ _value_key_coder->full_encode_ascending(p, &new_value);
+ _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid);
+ }
+ p++;
+ }
+ _row_ids_seen_for_bkd++;
+ _rid++;
+ }
}
return Status::OK();
}
+ void add_numeric_values(const void* values, size_t count) {
+ auto p = reinterpret_cast<const CppType*>(values);
+ for (size_t i = 0; i < count; ++i) {
+ add_value(*p);
+ p++;
+ _row_ids_seen_for_bkd++;
+ }
+ }
+
+ void add_value(const CppType& value) {
+ std::string new_value;
+ size_t value_length = sizeof(CppType);
+
+ _value_key_coder->full_encode_ascending(&value, &new_value);
+ _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid);
+
+ _rid++;
+ }
+
uint64_t size() const override {
//TODO: get size of inverted index
return 0;
}
Status finish() override {
+ lucene::store::Directory* dir = nullptr;
+ lucene::store::IndexOutput* data_out = nullptr;
+ lucene::store::IndexOutput* index_out = nullptr;
+ lucene::store::IndexOutput* meta_out = nullptr;
try {
- if constexpr (field_is_slice_type(field_type)) {
+ if constexpr (field_is_numeric_type(field_type)) {
+ auto index_path = InvertedIndexDescriptor::get_temporary_index_path(
+ _directory + "/" + _segment_file_name, _index_meta->index_id());
+ dir = DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true);
+ _bkd_writer->max_doc_ = _rid;
+ _bkd_writer->docs_seen_ = _row_ids_seen_for_bkd;
+ data_out = dir->createOutput(
+ InvertedIndexDescriptor::get_temporary_bkd_index_data_file_name().c_str());
+ meta_out = dir->createOutput(
+ InvertedIndexDescriptor::get_temporary_bkd_index_meta_file_name().c_str());
+ index_out = dir->createOutput(
+ InvertedIndexDescriptor::get_temporary_bkd_index_file_name().c_str());
+ if (data_out != nullptr && meta_out != nullptr && index_out != nullptr) {
+ _bkd_writer->meta_finish(meta_out, _bkd_writer->finish(data_out, index_out),
+ field_type);
+ }
+ FINALIZE_OUTPUT(meta_out)
+ FINALIZE_OUTPUT(data_out)
+ FINALIZE_OUTPUT(index_out)
+ FINALIZE_OUTPUT(dir)
+ } else if constexpr (field_is_slice_type(field_type)) {
close();
}
} catch (CLuceneError& e) {
+ FINALLY_FINALIZE_OUTPUT(meta_out)
+ FINALLY_FINALIZE_OUTPUT(data_out)
+ FINALLY_FINALIZE_OUTPUT(index_out)
+ FINALLY_FINALIZE_OUTPUT(dir)
LOG(WARNING) << "Inverted index writer finish error occurred: " << e.what();
- return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>();
+ return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+ "Inverted index writer finish error occurred");
}
return Status::OK();
@@ -277,6 +355,8 @@ public:
private:
rowid_t _rid = 0;
+ uint32_t _row_ids_seen_for_bkd = 0;
+ roaring::Roaring _null_bitmap;
uint64_t _reverted_index_size;
lucene::document::Document* _doc {};
@@ -284,9 +364,11 @@ private:
lucene::index::IndexWriter* _index_writer {};
lucene::analysis::Analyzer* _analyzer {};
lucene::util::SStringReader<char>* _char_string_reader {};
+ std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer;
std::string _segment_file_name;
std::string _directory;
io::FileSystemSPtr _fs;
+ const KeyCoder* _value_key_coder;
const TabletIndex* _index_meta;
InvertedIndexParserType _parser_type;
std::wstring _field_name;
@@ -416,4 +498,4 @@ Status InvertedIndexColumnWriter::create(const Field* field,
}
return Status::OK();
}
-} // namespace doris::segment_v2
+} // namespace doris::segment_v2
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org