You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2023/01/17 10:10:46 UTC

[GitHub] [doris] airborne12 commented on a diff in pull request #15994: [Feature-WIP](inverted index)(bkd) bdk index'reader implementation which in inverted index using for numeric types

airborne12 commented on code in PR #15994:
URL: https://github.com/apache/doris/pull/15994#discussion_r1071985440


##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -290,6 +291,313 @@ InvertedIndexReaderType StringTypeInvertedIndexReader::type() {
     return InvertedIndexReaderType::STRING_TYPE;
 }
 
+BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
+                               const uint32_t uniq_id)
+        : InvertedIndexReader(fs, path, uniq_id), compoundReader(nullptr) {
+    io::Path io_path(_path);
+    auto index_dir = io_path.parent_path();
+    auto index_file_name =
+            InvertedIndexDescriptor::get_index_file_name(io_path.filename(), _index_id);
+
+    // check index file existence
+    auto index_file = index_dir / index_file_name;
+    if (!indexExists(index_file)) {
+        LOG(WARNING) << "bkd index: " << index_file.string() << " not exist.";
+        return;
+    }
+    compoundReader = new DorisCompoundReader(
+            DorisCompoundDirectory::getDirectory(fs, index_dir.c_str()), index_file_name.c_str());
+}
+
+Status BkdIndexReader::new_iterator(const TabletIndex* index_meta,
+                                    InvertedIndexIterator** iterator) {
+    *iterator = new InvertedIndexIterator(index_meta, this);
+    return Status::OK();
+}
+
+Status BkdIndexReader::bkd_query(const std::string& column_name, const void* query_value,
+                                 InvertedIndexQueryType query_type,
+                                 std::shared_ptr<lucene::util::bkd::bkd_reader>&& r,
+                                 InvertedIndexVisitor* visitor) {
+    lucene::util::bkd::bkd_reader* tmp_reader;
+    auto status = get_bkd_reader(tmp_reader);
+    if (!status.ok()) {
+        LOG(WARNING) << "get bkd reader for column " << column_name
+                     << " failed: " << status.code_as_string();
+        return status;
+    }
+    r.reset(tmp_reader);
+    char tmp[r->bytes_per_dim_];
+    switch (query_type) {
+    case InvertedIndexQueryType::EQUAL_QUERY: {
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMax);
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMin);
+        break;
+    }
+    case InvertedIndexQueryType::LESS_THAN_QUERY:
+    case InvertedIndexQueryType::LESS_EQUAL_QUERY: {
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMax);
+        _type_info->set_to_min(tmp);
+        _value_key_coder->full_encode_ascending(tmp, &visitor->queryMin);
+        break;
+    }
+    case InvertedIndexQueryType::GREATER_THAN_QUERY:
+    case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMin);
+        _type_info->set_to_max(tmp);
+        _value_key_coder->full_encode_ascending(tmp, &visitor->queryMax);
+        break;
+    }
+    default:
+        LOG(ERROR) << "invalid query type when query bkd index";
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+    }
+    visitor->set_reader(r.get());
+    return Status::OK();
+}
+
+Status BkdIndexReader::query(const std::string& column_name, const void* query_value,
+                             InvertedIndexQueryType query_type,
+                             InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) {
+    uint64_t start = UnixMillis();
+    auto visitor = std::make_unique<InvertedIndexVisitor>(bit_map, query_type);
+    std::shared_ptr<lucene::util::bkd::bkd_reader> r;
+    try {
+        RETURN_IF_ERROR(
+                bkd_query(column_name, query_value, query_type, std::move(r), visitor.get()));
+        r->intersect(visitor.get());
+    } catch (const CLuceneError& e) {
+        LOG(WARNING) << "BKD Query CLuceneError Occurred, error msg: " << e.what();
+        return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>();
+    }
+
+    LOG(INFO) << "BKD index search time taken: " << UnixMillis() - start << "ms "
+              << " column: " << column_name << " result: " << bit_map->cardinality()
+              << " reader stats: " << r->stats.to_string();
+    return Status::OK();
+}
+
+Status BkdIndexReader::get_bkd_reader(lucene::util::bkd::bkd_reader*& bkdReader) {
+    // bkd file reader
+    if (compoundReader == nullptr) {
+        LOG(WARNING) << "bkd index input file not found";
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>();
+    }
+    CLuceneError err;
+    lucene::store::IndexInput* data_in;
+    lucene::store::IndexInput* meta_in;
+    lucene::store::IndexInput* index_in;
+
+    if (!compoundReader->openInput(
+                InvertedIndexDescriptor::get_temporary_bkd_index_data_file_name().c_str(), data_in,
+                err) ||
+        !compoundReader->openInput(
+                InvertedIndexDescriptor::get_temporary_bkd_index_meta_file_name().c_str(), meta_in,
+                err) ||
+        !compoundReader->openInput(
+                InvertedIndexDescriptor::get_temporary_bkd_index_file_name().c_str(), index_in,
+                err)) {
+        LOG(WARNING) << "bkd index input error: " << err.what();
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>();
+    }
+
+    bkdReader = new lucene::util::bkd::bkd_reader(data_in);
+    if (0 == bkdReader->read_meta(meta_in)) {
+        return Status::EndOfFile("bkd index file is empty");
+    }
+
+    bkdReader->read_index(index_in);
+
+    _type_info = get_scalar_type_info((FieldType)bkdReader->type);
+    if (_type_info == nullptr) {
+        auto type = bkdReader->type;
+        delete bkdReader;
+        LOG(WARNING) << "unsupported typeinfo, type=" << type;
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+    }
+    _value_key_coder = get_key_coder(_type_info->type());
+    return Status::OK();
+}
+
+InvertedIndexReaderType BkdIndexReader::type() {
+    return InvertedIndexReaderType::BKD;
+}
+
+InvertedIndexVisitor::InvertedIndexVisitor(roaring::Roaring* h, InvertedIndexQueryType query_type,
+                                           bool only_count)
+        : hits(h), num_hits(0), only_count(only_count), query_type(query_type) {}
+
+bool InvertedIndexVisitor::matches(uint8_t* packedValue) {
+    for (int dim = 0; dim < reader->num_data_dims_; dim++) {
+        int offset = dim * reader->bytes_per_dim_;
+        if (query_type == InvertedIndexQueryType::LESS_THAN_QUERY) {
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMax.c_str(), offset,
+                        offset + reader->bytes_per_dim_) >= 0) {
+                // Doc's value is too high, in this dimension
+                return false;
+            }
+        } else if (query_type == InvertedIndexQueryType::GREATER_THAN_QUERY) {
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMin.c_str(), offset,
+                        offset + reader->bytes_per_dim_) <= 0) {
+                // Doc's value is too high, in this dimension
+                return false;
+            }
+        } else {
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMin.c_str(), offset,
+                        offset + reader->bytes_per_dim_) < 0) {
+                // Doc's value is too low, in this dimension
+                return false;
+            }
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMax.c_str(), offset,
+                        offset + reader->bytes_per_dim_) > 0) {
+                // Doc's value is too high, in this dimension
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+void InvertedIndexVisitor::visit(std::vector<char>& docID, std::vector<uint8_t>& packedValue) {
+    if (!matches(packedValue.data())) {
+        return;
+    }
+    visit(roaring::Roaring::read(docID.data(), false));
+}
+
+void InvertedIndexVisitor::visit(Roaring* docID, std::vector<uint8_t>& packedValue) {
+    if (!matches(packedValue.data())) {
+        return;
+    }
+    visit(*docID);
+}
+
+void InvertedIndexVisitor::visit(roaring::Roaring&& r) {
+    if (only_count) {
+        num_hits += r.cardinality();
+    } else {
+        *hits |= r;
+    }
+}
+
+void InvertedIndexVisitor::visit(roaring::Roaring& r) {
+    if (only_count) {
+        num_hits += r.cardinality();
+    } else {
+        *hits |= r;
+    }
+}
+
+void InvertedIndexVisitor::visit(int rowID) {
+    if (only_count) {
+        num_hits++;
+    } else {
+        hits->add(rowID);
+    }
+    if (0) {
+        std::wcout << L"visit docID=" << rowID << std::endl;
+    }
+}
+
+void InvertedIndexVisitor::visit(lucene::util::bkd::bkd_docid_set_iterator* iter,
+                                 std::vector<uint8_t>& packedValue) {
+    if (!matches(packedValue.data())) {
+        return;
+    }
+    int32_t docID = iter->docid_set->nextDoc();
+    while (docID != lucene::util::bkd::bkd_docid_set::NO_MORE_DOCS) {
+        if (only_count) {
+            num_hits++;
+        } else {
+            hits->add(docID);
+        }
+        docID = iter->docid_set->nextDoc();
+    }
+}
+
+void InvertedIndexVisitor::visit(int rowID, std::vector<uint8_t>& packedValue) {
+    if (0) {

Review Comment:
   This `if (0)` block is dead code and is no longer needed; please remove it.



##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -290,6 +291,313 @@ InvertedIndexReaderType StringTypeInvertedIndexReader::type() {
     return InvertedIndexReaderType::STRING_TYPE;
 }
 
+BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path,
+                               const uint32_t uniq_id)
+        : InvertedIndexReader(fs, path, uniq_id), compoundReader(nullptr) {
+    io::Path io_path(_path);
+    auto index_dir = io_path.parent_path();
+    auto index_file_name =
+            InvertedIndexDescriptor::get_index_file_name(io_path.filename(), _index_id);
+
+    // check index file existence
+    auto index_file = index_dir / index_file_name;
+    if (!indexExists(index_file)) {
+        LOG(WARNING) << "bkd index: " << index_file.string() << " not exist.";
+        return;
+    }
+    compoundReader = new DorisCompoundReader(
+            DorisCompoundDirectory::getDirectory(fs, index_dir.c_str()), index_file_name.c_str());
+}
+
+Status BkdIndexReader::new_iterator(const TabletIndex* index_meta,
+                                    InvertedIndexIterator** iterator) {
+    *iterator = new InvertedIndexIterator(index_meta, this);
+    return Status::OK();
+}
+
+Status BkdIndexReader::bkd_query(const std::string& column_name, const void* query_value,
+                                 InvertedIndexQueryType query_type,
+                                 std::shared_ptr<lucene::util::bkd::bkd_reader>&& r,
+                                 InvertedIndexVisitor* visitor) {
+    lucene::util::bkd::bkd_reader* tmp_reader;
+    auto status = get_bkd_reader(tmp_reader);
+    if (!status.ok()) {
+        LOG(WARNING) << "get bkd reader for column " << column_name
+                     << " failed: " << status.code_as_string();
+        return status;
+    }
+    r.reset(tmp_reader);
+    char tmp[r->bytes_per_dim_];
+    switch (query_type) {
+    case InvertedIndexQueryType::EQUAL_QUERY: {
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMax);
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMin);
+        break;
+    }
+    case InvertedIndexQueryType::LESS_THAN_QUERY:
+    case InvertedIndexQueryType::LESS_EQUAL_QUERY: {
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMax);
+        _type_info->set_to_min(tmp);
+        _value_key_coder->full_encode_ascending(tmp, &visitor->queryMin);
+        break;
+    }
+    case InvertedIndexQueryType::GREATER_THAN_QUERY:
+    case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
+        _value_key_coder->full_encode_ascending(query_value, &visitor->queryMin);
+        _type_info->set_to_max(tmp);
+        _value_key_coder->full_encode_ascending(tmp, &visitor->queryMax);
+        break;
+    }
+    default:
+        LOG(ERROR) << "invalid query type when query bkd index";
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+    }
+    visitor->set_reader(r.get());
+    return Status::OK();
+}
+
+Status BkdIndexReader::query(const std::string& column_name, const void* query_value,
+                             InvertedIndexQueryType query_type,
+                             InvertedIndexParserType analyser_type, roaring::Roaring* bit_map) {
+    uint64_t start = UnixMillis();
+    auto visitor = std::make_unique<InvertedIndexVisitor>(bit_map, query_type);
+    std::shared_ptr<lucene::util::bkd::bkd_reader> r;
+    try {
+        RETURN_IF_ERROR(
+                bkd_query(column_name, query_value, query_type, std::move(r), visitor.get()));
+        r->intersect(visitor.get());
+    } catch (const CLuceneError& e) {
+        LOG(WARNING) << "BKD Query CLuceneError Occurred, error msg: " << e.what();
+        return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>();
+    }
+
+    LOG(INFO) << "BKD index search time taken: " << UnixMillis() - start << "ms "
+              << " column: " << column_name << " result: " << bit_map->cardinality()
+              << " reader stats: " << r->stats.to_string();
+    return Status::OK();
+}
+
+Status BkdIndexReader::get_bkd_reader(lucene::util::bkd::bkd_reader*& bkdReader) {
+    // bkd file reader
+    if (compoundReader == nullptr) {
+        LOG(WARNING) << "bkd index input file not found";
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>();
+    }
+    CLuceneError err;
+    lucene::store::IndexInput* data_in;
+    lucene::store::IndexInput* meta_in;
+    lucene::store::IndexInput* index_in;
+
+    if (!compoundReader->openInput(
+                InvertedIndexDescriptor::get_temporary_bkd_index_data_file_name().c_str(), data_in,
+                err) ||
+        !compoundReader->openInput(
+                InvertedIndexDescriptor::get_temporary_bkd_index_meta_file_name().c_str(), meta_in,
+                err) ||
+        !compoundReader->openInput(
+                InvertedIndexDescriptor::get_temporary_bkd_index_file_name().c_str(), index_in,
+                err)) {
+        LOG(WARNING) << "bkd index input error: " << err.what();
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>();
+    }
+
+    bkdReader = new lucene::util::bkd::bkd_reader(data_in);
+    if (0 == bkdReader->read_meta(meta_in)) {
+        return Status::EndOfFile("bkd index file is empty");
+    }
+
+    bkdReader->read_index(index_in);
+
+    _type_info = get_scalar_type_info((FieldType)bkdReader->type);
+    if (_type_info == nullptr) {
+        auto type = bkdReader->type;
+        delete bkdReader;
+        LOG(WARNING) << "unsupported typeinfo, type=" << type;
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+    }
+    _value_key_coder = get_key_coder(_type_info->type());
+    return Status::OK();
+}
+
+InvertedIndexReaderType BkdIndexReader::type() {
+    return InvertedIndexReaderType::BKD;
+}
+
+InvertedIndexVisitor::InvertedIndexVisitor(roaring::Roaring* h, InvertedIndexQueryType query_type,
+                                           bool only_count)
+        : hits(h), num_hits(0), only_count(only_count), query_type(query_type) {}
+
+bool InvertedIndexVisitor::matches(uint8_t* packedValue) {
+    for (int dim = 0; dim < reader->num_data_dims_; dim++) {
+        int offset = dim * reader->bytes_per_dim_;
+        if (query_type == InvertedIndexQueryType::LESS_THAN_QUERY) {
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMax.c_str(), offset,
+                        offset + reader->bytes_per_dim_) >= 0) {
+                // Doc's value is too high, in this dimension
+                return false;
+            }
+        } else if (query_type == InvertedIndexQueryType::GREATER_THAN_QUERY) {
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMin.c_str(), offset,
+                        offset + reader->bytes_per_dim_) <= 0) {
+                // Doc's value is too high, in this dimension
+                return false;
+            }
+        } else {
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMin.c_str(), offset,
+                        offset + reader->bytes_per_dim_) < 0) {
+                // Doc's value is too low, in this dimension
+                return false;
+            }
+            if (lucene::util::FutureArrays::CompareUnsigned(
+                        packedValue, offset, offset + reader->bytes_per_dim_,
+                        (const uint8_t*)queryMax.c_str(), offset,
+                        offset + reader->bytes_per_dim_) > 0) {
+                // Doc's value is too high, in this dimension
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+void InvertedIndexVisitor::visit(std::vector<char>& docID, std::vector<uint8_t>& packedValue) {
+    if (!matches(packedValue.data())) {
+        return;
+    }
+    visit(roaring::Roaring::read(docID.data(), false));
+}
+
+void InvertedIndexVisitor::visit(Roaring* docID, std::vector<uint8_t>& packedValue) {
+    if (!matches(packedValue.data())) {
+        return;
+    }
+    visit(*docID);
+}
+
+void InvertedIndexVisitor::visit(roaring::Roaring&& r) {
+    if (only_count) {
+        num_hits += r.cardinality();
+    } else {
+        *hits |= r;
+    }
+}
+
+void InvertedIndexVisitor::visit(roaring::Roaring& r) {
+    if (only_count) {
+        num_hits += r.cardinality();
+    } else {
+        *hits |= r;
+    }
+}
+
+void InvertedIndexVisitor::visit(int rowID) {
+    if (only_count) {
+        num_hits++;
+    } else {
+        hits->add(rowID);
+    }
+    if (0) {

Review Comment:
   This `if (0)` block is dead code and is not needed; please remove it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org