You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/07/03 15:20:35 UTC

[doris] branch master updated: [feature-wip](unique-key-merge-on-write) add primary key index (#10529)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 91fca49df4 [feature-wip](unique-key-merge-on-write) add primary key index (#10529)
91fca49df4 is described below

commit 91fca49df47be07c68ba0b39a8b07847fb300581
Author: Xin Liao <li...@126.com>
AuthorDate: Sun Jul 3 23:20:30 2022 +0800

    [feature-wip](unique-key-merge-on-write) add primary key index (#10529)
---
 be/src/olap/CMakeLists.txt              |   1 +
 be/src/olap/primary_key_index.cpp       |  66 +++++++++++++++++
 be/src/olap/primary_key_index.h         |  84 ++++++++++++++++++++++
 be/test/CMakeLists.txt                  |   1 +
 be/test/olap/primary_key_index_test.cpp | 123 ++++++++++++++++++++++++++++++++
 5 files changed, 275 insertions(+)

diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt
index 615a9fff44..c003c2bdc4 100644
--- a/be/src/olap/CMakeLists.txt
+++ b/be/src/olap/CMakeLists.txt
@@ -90,6 +90,7 @@ add_library(Olap STATIC
     types.cpp 
     utils.cpp
     wrapper_field.cpp
+    primary_key_index.cpp
     rowset/segment_v2/bitmap_index_reader.cpp
     rowset/segment_v2/bitmap_index_writer.cpp
     rowset/segment_v2/bitshuffle_page.cpp
diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp
new file mode 100644
index 0000000000..996dfa755c
--- /dev/null
+++ b/be/src/olap/primary_key_index.cpp
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/primary_key_index.h"
+
+#include "olap/rowset/segment_v2/encoding_info.h"
+
+namespace doris {
+
+Status PrimaryKeyIndexBuilder::init() {
+    // TODO(liaoxin) using the column type directly if there's only one column in unique key columns
+    const auto* type_info = get_scalar_type_info<OLAP_FIELD_TYPE_VARCHAR>();
+    segment_v2::IndexedColumnWriterOptions options;
+    options.write_ordinal_index = false;
+    options.write_value_index = true;
+    options.encoding = segment_v2::EncodingInfo::get_default_encoding(type_info, true);
+    // TODO(liaoxin) test to confirm whether it needs to be compressed
+    options.compression = segment_v2::NO_COMPRESSION; // currently not compressed
+    _index_builder.reset(new segment_v2::IndexedColumnWriter(options, type_info, _wblock));
+    return _index_builder->init();
+}
+
+Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
+    _index_builder->add(&key);
+    // the key is already sorted, so the first key is min_key, and
+    // the last key is max_key.
+    if (UNLIKELY(_num_rows == 0)) {
+        _min_key.append(key.get_data(), key.get_size());
+    }
+    _max_key.clear();
+    _max_key.append(key.get_data(), key.get_size());
+    _num_rows++;
+    _size += key.get_size();
+    return Status::OK();
+}
+
+Status PrimaryKeyIndexBuilder::finalize(segment_v2::IndexedColumnMetaPB* meta) {
+    // finish primary key index
+    return _index_builder->finish(meta);
+}
+
+Status PrimaryKeyIndexReader::parse(const FilePathDesc& path_desc,
+                                    const segment_v2::IndexedColumnMetaPB& meta) {
+    // parse primary key index
+    _index_reader.reset(new segment_v2::IndexedColumnReader(path_desc, meta));
+    RETURN_IF_ERROR(_index_reader->load(_use_page_cache, _kept_in_memory));
+
+    _parsed = true;
+    return Status::OK();
+}
+
+} // namespace doris
diff --git a/be/src/olap/primary_key_index.h b/be/src/olap/primary_key_index.h
new file mode 100644
index 0000000000..fdddb1fab1
--- /dev/null
+++ b/be/src/olap/primary_key_index.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "common/status.h"
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/rowset/segment_v2/indexed_column_reader.h"
+#include "olap/rowset/segment_v2/indexed_column_writer.h"
+#include "util/faststring.h"
+#include "util/slice.h"
+
+namespace doris {
+
+// Build index for primary key.
+// The primary key index is designed in a similar way like RocksDB
+// Partitioned Index, which is created in the segment file when MemTable flushes.
+// Index is stored in multiple pages to leverage the IndexedColumnWriter.
+class PrimaryKeyIndexBuilder {
+public:
+    PrimaryKeyIndexBuilder(fs::WritableBlock* wblock) : _wblock(wblock), _num_rows(0), _size(0) {}
+
+    Status init();
+
+    Status add_item(const Slice& key);
+
+    uint32_t num_rows() const { return _num_rows; }
+
+    uint64_t size() { return _size; }
+
+    Slice min_key() { return Slice(_min_key); }
+    Slice max_key() { return Slice(_max_key); }
+
+    Status finalize(segment_v2::IndexedColumnMetaPB* meta);
+
+private:
+    fs::WritableBlock* _wblock = nullptr;
+    uint32_t _num_rows;
+    uint64_t _size;
+
+    faststring _min_key;
+    faststring _max_key;
+    std::unique_ptr<segment_v2::IndexedColumnWriter> _index_builder;
+};
+
+class PrimaryKeyIndexReader {
+public:
+    PrimaryKeyIndexReader() : _parsed(false) {}
+
+    Status parse(const FilePathDesc& path_desc, const segment_v2::IndexedColumnMetaPB& meta);
+
+    Status new_iterator(std::unique_ptr<segment_v2::IndexedColumnIterator>* index_iterator) const {
+        DCHECK(_parsed);
+        index_iterator->reset(new segment_v2::IndexedColumnIterator(_index_reader.get()));
+        return Status::OK();
+    }
+
+    uint32_t num_rows() const {
+        DCHECK(_parsed);
+        return _index_reader->num_values();
+    }
+
+private:
+    bool _parsed;
+    bool _use_page_cache = true;
+    bool _kept_in_memory = true;
+    std::unique_ptr<segment_v2::IndexedColumnReader> _index_reader;
+};
+
+} // namespace doris
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 49c14c49f6..6b8648b9ba 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -199,6 +199,7 @@ set(OLAP_TEST_FILES
     olap/options_test.cpp
     olap/fs/file_block_manager_test.cpp
     olap/common_test.cpp
+    olap/primary_key_index_test.cpp
     # olap/memtable_flush_executor_test.cpp
     # olap/push_handler_test.cpp
 )
diff --git a/be/test/olap/primary_key_index_test.cpp b/be/test/olap/primary_key_index_test.cpp
new file mode 100644
index 0000000000..da8a63bc41
--- /dev/null
+++ b/be/test/olap/primary_key_index_test.cpp
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/primary_key_index.h"
+
+#include <gtest/gtest.h>
+
+#include "olap/fs/block_manager.h"
+#include "olap/fs/fs_util.h"
+#include "olap/row_cursor.h"
+#include "olap/tablet_schema_helper.h"
+#include "util/debug_util.h"
+#include "util/file_utils.h"
+
+namespace doris {
+
+class PrimaryKeyIndexTest : public testing::Test {
+public:
+    PrimaryKeyIndexTest() {}
+    virtual ~PrimaryKeyIndexTest() {}
+
+    void SetUp() override {
+        if (FileUtils::check_exist(kTestDir)) {
+            EXPECT_TRUE(FileUtils::remove_all(kTestDir).ok());
+        }
+        EXPECT_TRUE(FileUtils::create_dir(kTestDir).ok());
+    }
+    void TearDown() override {
+        if (FileUtils::check_exist(kTestDir)) {
+            EXPECT_TRUE(FileUtils::remove_all(kTestDir).ok());
+        }
+    }
+
+private:
+    const std::string kTestDir = "./ut_dir/primary_key_index_test";
+};
+
+TEST_F(PrimaryKeyIndexTest, builder) {
+    std::string filename = kTestDir + "/builder";
+    std::unique_ptr<fs::WritableBlock> wblock;
+    fs::CreateBlockOptions opts(filename);
+    std::string storage_name;
+    EXPECT_TRUE(fs::fs_util::block_manager(storage_name)->create_block(opts, &wblock).ok());
+
+    PrimaryKeyIndexBuilder builder(wblock.get());
+    builder.init();
+    size_t num_rows = 0;
+    std::vector<std::string> keys;
+    for (int i = 1000; i < 10000; i += 2) {
+        keys.push_back(std::to_string(i));
+        builder.add_item(std::to_string(i));
+        num_rows++;
+    }
+    EXPECT_EQ("1000", builder.min_key().to_string());
+    EXPECT_EQ("9998", builder.max_key().to_string());
+    segment_v2::IndexedColumnMetaPB index_meta;
+    EXPECT_TRUE(builder.finalize(&index_meta));
+    EXPECT_TRUE(wblock->close().ok());
+    EXPECT_EQ(num_rows, builder.num_rows());
+
+    FilePathDesc path_desc(filename);
+    PrimaryKeyIndexReader index_reader;
+    EXPECT_TRUE(index_reader.parse(path_desc, index_meta).ok());
+    EXPECT_EQ(num_rows, index_reader.num_rows());
+
+    std::unique_ptr<segment_v2::IndexedColumnIterator> index_iterator;
+    EXPECT_TRUE(index_reader.new_iterator(&index_iterator).ok());
+    bool exact_match = false;
+    uint32_t row_id;
+    for (size_t i = 0; i < keys.size(); i++) {
+        auto status = index_iterator->seek_at_or_after(&keys[i], &exact_match);
+        EXPECT_TRUE(status.ok());
+        EXPECT_TRUE(exact_match);
+        row_id = index_iterator->get_current_ordinal();
+        EXPECT_EQ(i, row_id);
+    }
+    // find a non-existing key "8701"
+    {
+        string key("8701");
+        Slice slice(key);
+        auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
+        EXPECT_TRUE(status.ok());
+        EXPECT_FALSE(exact_match);
+        row_id = index_iterator->get_current_ordinal();
+        EXPECT_EQ(3851, row_id);
+    }
+
+    // find prefix "87"
+    {
+        string key("87");
+        Slice slice(key);
+        auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
+        EXPECT_TRUE(status.ok());
+        EXPECT_FALSE(exact_match);
+        row_id = index_iterator->get_current_ordinal();
+        EXPECT_EQ(3850, row_id);
+    }
+
+    // find prefix "9999"
+    {
+        string key("9999");
+        Slice slice(key);
+        auto status = index_iterator->seek_at_or_after(&slice, &exact_match);
+        EXPECT_FALSE(exact_match);
+        EXPECT_TRUE(status.is_not_found());
+    }
+}
+
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org