You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ya...@apache.org on 2021/07/30 01:23:20 UTC

[incubator-doris] branch master updated: [Feature] Support config max length of zone map index (#6293)

This is an automated email from the ASF dual-hosted git repository.

yangzhg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 6597a33  [Feature] Support config max length of zone map index (#6293)
6597a33 is described below

commit 6597a338dcce053b4e809c0cf5fb4386db74aeb5
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Thu Jul 29 20:23:11 2021 -0500

    [Feature] Support config max length of zone map index (#6293)
---
 be/src/common/configbase.cpp                       |  1 -
 be/src/olap/field.h                                | 76 ++++++++++++++++++++++
 be/src/olap/olap_define.h                          |  2 +
 be/src/olap/rowset/segment_v2/zone_map_index.cpp   | 18 +++--
 be/src/olap/rowset/segment_v2/zone_map_index.h     |  6 +-
 be/src/olap/types.cpp                              |  1 +
 be/src/olap/types.h                                | 35 +++++++++-
 be/src/runtime/descriptors.cpp                     |  2 -
 be/src/runtime/row_batch.cpp                       |  2 -
 be/src/runtime/row_batch.h                         |  1 -
 be/src/service/doris_main.cpp                      |  3 -
 .../olap/rowset/segment_v2/zone_map_index_test.cpp | 51 +++++++++++++++
 12 files changed, 178 insertions(+), 20 deletions(-)

diff --git a/be/src/common/configbase.cpp b/be/src/common/configbase.cpp
index dc5a5b2..b44fc96 100644
--- a/be/src/common/configbase.cpp
+++ b/be/src/common/configbase.cpp
@@ -22,7 +22,6 @@
 #include <iostream>
 #include <list>
 #include <map>
-#include <sstream>
 
 #define __IN_CONFIGBASE_CPP__
 #include "common/config.h"
diff --git a/be/src/olap/field.h b/be/src/olap/field.h
index 0edb4db..e290b50 100644
--- a/be/src/olap/field.h
+++ b/be/src/olap/field.h
@@ -66,7 +66,14 @@ public:
     inline const std::string& name() const { return _name; }
 
     virtual inline void set_to_max(char* buf) const { return _type_info->set_to_max(buf); }
+    virtual inline void set_to_zone_map_max(char* buf) const { // zone-map variant of set_to_max
+        set_to_max(buf); // base behaviour: identical to set_to_max
+    }
+
     inline void set_to_min(char* buf) const { return _type_info->set_to_min(buf); }
+    inline void set_to_zone_map_min(char* buf) const { // zone-map variant of set_to_min
+        set_to_min(buf); // forwards unchanged; unlike set_to_zone_map_max this is not virtual
+    }
 
     // This function allocate memory from pool, other than allocate_memory
     // reserve memory from continuous memory.
@@ -74,6 +81,10 @@ public:
         return (char*)pool->allocate(_type_info->size());
     }
 
+    virtual inline char* allocate_zone_map_value(MemPool* pool) const { // buffer for a zone map min/max value
+        return allocate_value(pool); // base behaviour: same allocation as allocate_value
+    }
+
     inline void agg_update(RowCursorCell* dest, const RowCursorCell& src,
                            MemPool* mem_pool = nullptr) const {
         _agg_info->update(dest, src, mem_pool);
@@ -103,6 +114,8 @@ public:
 
     virtual size_t get_variable_len() const { return 0; }
 
+    virtual void modify_zone_map_index(char*) const {}; // no-op in the base class
+
     virtual Field* clone() const {
         auto* local = new Field();
         this->clone(local);
@@ -456,6 +469,37 @@ public:
         slice->size = _length;
         memset(slice->data, 0xFF, slice->size);
     }
+
+    // To keep the zone map from using too much memory, when the field length
+    // (`_length`) exceeds `MAX_ZONE_MAP_INDEX_SIZE` we allocate only
+    // `MAX_ZONE_MAP_INDEX_SIZE` bytes for the value.
+    char* allocate_zone_map_value(MemPool *pool) const override {
+        char* type_value = (char*)pool->allocate(sizeof(Slice)); // the Slice header itself
+        auto slice = reinterpret_cast<Slice*>(type_value);
+        slice->size = MAX_ZONE_MAP_INDEX_SIZE > _length ? _length :
+                MAX_ZONE_MAP_INDEX_SIZE;
+        slice->data = (char*)pool->allocate(slice->size); // payload: min(_length, MAX_ZONE_MAP_INDEX_SIZE) bytes
+        return type_value;
+    }
+
+    // Adjusts a zone map max_value whose size equals `MAX_ZONE_MAP_INDEX_SIZE`
+    // (i.e. one that may have been cut) by adding one to its last byte,
+    // presumably so the stored prefix still bounds the uncut value from above.
+    // Relies on UTF-8 data, where the byte 0xff never appears, so the add does not wrap.
+    void modify_zone_map_index(char* src) const override {
+        auto slice = reinterpret_cast<Slice*>(src);
+        if (slice->size == MAX_ZONE_MAP_INDEX_SIZE) {
+            slice->mutable_data()[slice->size - 1] += 1;
+        }
+    }
+
+     void set_to_zone_map_max(char* ch) const override { // 0xFF-filled max, capped for the zone map
+        auto slice = reinterpret_cast<Slice*>(ch);
+        int length = _length < MAX_ZONE_MAP_INDEX_SIZE ? _length :
+                MAX_ZONE_MAP_INDEX_SIZE;
+        slice->size = length; // min(_length, MAX_ZONE_MAP_INDEX_SIZE)
+        memset(slice->data, 0xFF, slice->size);
+    }
 };
 
 class VarcharField : public Field {
@@ -484,11 +528,43 @@ public:
         return Field::allocate_string_value(pool);
     }
 
+    // To keep the zone map from using too much memory, when the varchar length
+    // exceeds `MAX_ZONE_MAP_INDEX_SIZE` we allocate only
+    // `MAX_ZONE_MAP_INDEX_SIZE` bytes for the value.
+    char* allocate_zone_map_value(MemPool *pool) const override {
+        char* type_value = (char*)pool->allocate(sizeof(Slice)); // the Slice header itself
+        auto slice = reinterpret_cast<Slice*>(type_value);
+        slice->size = MAX_ZONE_MAP_INDEX_SIZE > _length ? _length :
+                MAX_ZONE_MAP_INDEX_SIZE;
+        slice->data = (char*)pool->allocate(slice->size); // payload: min(_length, MAX_ZONE_MAP_INDEX_SIZE) bytes
+        return type_value;
+    }
+
+    // Adjusts a zone map max_value whose size equals `MAX_ZONE_MAP_INDEX_SIZE`
+    // (i.e. one that may have been cut) by adding one to its last byte,
+    // presumably so the stored prefix still bounds the uncut value from above.
+    // Relies on UTF-8 data, where the byte 0xff never appears, so the add does not wrap.
+    void modify_zone_map_index(char* src) const override {
+        auto slice = reinterpret_cast<Slice*>(src);
+        if (slice->size == MAX_ZONE_MAP_INDEX_SIZE) {
+            slice->mutable_data()[slice->size - 1] += 1;
+        }
+    }
+
     void set_to_max(char* ch) const override {
         auto slice = reinterpret_cast<Slice*>(ch);
         slice->size = _length - OLAP_STRING_MAX_BYTES;
         memset(slice->data, 0xFF, slice->size);
     }
+
+    void set_to_zone_map_max(char* ch) const override { // 0xFF-filled max, capped for the zone map
+        auto slice = reinterpret_cast<Slice*>(ch);
+        int length = _length < MAX_ZONE_MAP_INDEX_SIZE ? _length :
+                MAX_ZONE_MAP_INDEX_SIZE;
+
+        slice->size = length - OLAP_STRING_MAX_BYTES; // same OLAP_STRING_MAX_BYTES deduction as set_to_max
+        memset(slice->data, 0xFF, slice->size);
+    }
 };
 
 class BitmapAggField : public Field {
diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h
index 977e04b..75b1087 100644
--- a/be/src/olap/olap_define.h
+++ b/be/src/olap/olap_define.h
@@ -62,6 +62,8 @@ static const uint16_t OLAP_STRING_MAX_BYTES = sizeof(StringLengthType);
 // the max bytes for stored array length
 static const uint16_t OLAP_ARRAY_MAX_BYTES = OLAP_ARRAY_MAX_LENGTH;
 
+static constexpr uint16_t MAX_ZONE_MAP_INDEX_SIZE = 512; // max size of a string value kept in a zone map index
+
 enum OLAPDataVersion {
     OLAP_V1 = 0,
     DORIS_V1 = 1,
diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.cpp b/be/src/olap/rowset/segment_v2/zone_map_index.cpp
index d9775e5..c051319 100644
--- a/be/src/olap/rowset/segment_v2/zone_map_index.cpp
+++ b/be/src/olap/rowset/segment_v2/zone_map_index.cpp
@@ -33,11 +33,11 @@ namespace segment_v2 {
 
 ZoneMapIndexWriter::ZoneMapIndexWriter(Field* field)
         : _field(field), _tracker(new MemTracker(-1, "ZoneMapIndexWriter")), _pool(_tracker.get()) {
-    _page_zone_map.min_value = _field->allocate_value(&_pool);
-    _page_zone_map.max_value = _field->allocate_value(&_pool);
+    _page_zone_map.min_value = _field->allocate_zone_map_value(&_pool); // field-specific zone map buffer
+    _page_zone_map.max_value = _field->allocate_zone_map_value(&_pool);
     _reset_zone_map(&_page_zone_map);
-    _segment_zone_map.min_value = _field->allocate_value(&_pool);
-    _segment_zone_map.max_value = _field->allocate_value(&_pool);
+    _segment_zone_map.min_value = _field->allocate_zone_map_value(&_pool); // same for the segment-level map
+    _segment_zone_map.max_value = _field->allocate_zone_map_value(&_pool);
     _reset_zone_map(&_segment_zone_map);
 }
 
@@ -48,15 +48,19 @@ void ZoneMapIndexWriter::add_values(const void* values, size_t count) {
     const char* vals = reinterpret_cast<const char*>(values);
     for (int i = 0; i < count; ++i) {
         if (_field->compare(_page_zone_map.min_value, vals) > 0) {
-            _field->type_info()->direct_copy(_page_zone_map.min_value, vals);
+            _field->type_info()->direct_copy_may_cut(_page_zone_map.min_value, vals);
         }
         if (_field->compare(_page_zone_map.max_value, vals) < 0) {
-            _field->type_info()->direct_copy(_page_zone_map.max_value, vals);
+            _field->type_info()->direct_copy_may_cut(_page_zone_map.max_value, vals);
         }
         vals += _field->size();
     }
 }
 
+void ZoneMapIndexWriter::moidfy_index_before_flush(struct doris::segment_v2::ZoneMap & zone_map) { // NOTE(review): "moidfy" looks like a typo of "modify"; same spelling at the declaration and call sites
+    _field->modify_zone_map_index(zone_map.max_value); // only max_value is adjusted; min_value is left as is
+}
+
 void ZoneMapIndexWriter::reset_page_zone_map() {
     _page_zone_map.pass_all  = true;
 }
@@ -81,6 +85,7 @@ Status ZoneMapIndexWriter::flush() {
     }
 
     ZoneMapPB zone_map_pb;
+    moidfy_index_before_flush(_page_zone_map);
     _page_zone_map.to_proto(&zone_map_pb, _field);
     _reset_zone_map(&_page_zone_map);
 
@@ -98,6 +103,7 @@ Status ZoneMapIndexWriter::finish(fs::WritableBlock* wblock, ColumnIndexMetaPB*
     index_meta->set_type(ZONE_MAP_INDEX);
     ZoneMapIndexPB* meta = index_meta->mutable_zone_map_index();
     // store segment zone map
+    moidfy_index_before_flush(_segment_zone_map);
     _segment_zone_map.to_proto(meta->mutable_segment_zone_map(), _field);
 
     // write out zone map for each data pages
diff --git a/be/src/olap/rowset/segment_v2/zone_map_index.h b/be/src/olap/rowset/segment_v2/zone_map_index.h
index a3f4bbe..6efc786 100644
--- a/be/src/olap/rowset/segment_v2/zone_map_index.h
+++ b/be/src/olap/rowset/segment_v2/zone_map_index.h
@@ -85,6 +85,8 @@ public:
 
     Status finish(fs::WritableBlock* wblock, ColumnIndexMetaPB* index_meta);
 
+    void moidfy_index_before_flush(ZoneMap& zone_map); // lets the field adjust zone_map.max_value before it is written out
+
     uint64_t size() { return _estimated_size; }
 
     void reset_page_zone_map();
@@ -93,8 +95,8 @@ public:
 private:
     void _reset_zone_map(ZoneMap* zone_map) {
         // we should allocate max varchar length and set to max for min value
-        _field->set_to_max(zone_map->min_value);
-        _field->set_to_min(zone_map->max_value);
+        _field->set_to_zone_map_max(zone_map->min_value);
+        _field->set_to_zone_map_min(zone_map->max_value);
         zone_map->has_null = false;
         zone_map->has_not_null = false;
         zone_map->pass_all = false;
diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp
index 136bd7e..b26e069 100644
--- a/be/src/olap/types.cpp
+++ b/be/src/olap/types.cpp
@@ -29,6 +29,7 @@ ScalarTypeInfo::ScalarTypeInfo(TypeTraitsClass t)
           _deep_copy(TypeTraitsClass::deep_copy),
           _copy_object(TypeTraitsClass::copy_object),
           _direct_copy(TypeTraitsClass::direct_copy),
+          _direct_copy_may_cut(TypeTraitsClass::direct_copy_may_cut),
           _convert_from(TypeTraitsClass::convert_from),
           _from_string(TypeTraitsClass::from_string),
           _to_string(TypeTraitsClass::to_string),
diff --git a/be/src/olap/types.h b/be/src/olap/types.h
index e42aa80..a2fba4d 100644
--- a/be/src/olap/types.h
+++ b/be/src/olap/types.h
@@ -59,6 +59,9 @@ public:
 
     virtual void direct_copy(void* dest, const void* src) const = 0;
 
+    // Used only by the zone map: copies src into dest and may cut (truncate) the data.
+    virtual void direct_copy_may_cut(void* dest, const void* src) const = 0;
+
     //convert and deep copy value from other type's source
     virtual OLAPStatus convert_from(void* dest, const void* src, const TypeInfo* src_type,
                                     MemPool* mem_pool) const = 0;
@@ -100,6 +103,8 @@ public:
 
     inline void direct_copy(void* dest, const void* src) const override { _direct_copy(dest, src); }
 
+    inline void direct_copy_may_cut(void* dest, const void* src) const override { _direct_copy_may_cut(dest, src); } // dispatches through the per-type function pointer
+
     //convert and deep copy value from other type's source
     OLAPStatus convert_from(void* dest, const void* src, const TypeInfo* src_type,
                             MemPool* mem_pool) const override {
@@ -130,6 +135,7 @@ private:
     void (*_deep_copy)(void* dest, const void* src, MemPool* mem_pool);
     void (*_copy_object)(void* dest, const void* src, MemPool* mem_pool);
     void (*_direct_copy)(void* dest, const void* src);
+    void (*_direct_copy_may_cut)(void* dest, const void* src);
     OLAPStatus (*_convert_from)(void* dest, const void* src, const TypeInfo* src_type,
                                 MemPool* mem_pool);
 
@@ -291,6 +297,10 @@ public:
         }
     }
 
+    inline void direct_copy_may_cut(void* dest, const void* src) const override { // zone-map copy entry point
+        direct_copy(dest, src); // no cutting here: plain direct_copy
+    }
+
     OLAPStatus convert_from(void* dest, const void* src, const TypeInfo* src_type,
                             MemPool* mem_pool) const override {
         return OLAPStatus::OLAP_ERR_FUNC_NOT_IMPLEMENTED;
@@ -492,6 +502,10 @@ struct BaseFieldtypeTraits : public CppTypeTraits<field_type> {
         *reinterpret_cast<CppType*>(dest) = *reinterpret_cast<const CppType*>(src);
     }
 
+    static inline void direct_copy_may_cut(void* dest, const void* src) { // default for field type traits
+        direct_copy(dest, src); // no cutting by default: plain direct_copy
+    }
+
     static OLAPStatus convert_from(void* dest, const void* src, const TypeInfo* src_type,
                                    MemPool* mem_pool) {
         return OLAPStatus::OLAP_ERR_FUNC_NOT_IMPLEMENTED;
@@ -510,9 +524,7 @@ struct BaseFieldtypeTraits : public CppTypeTraits<field_type> {
     }
 
     static std::string to_string(const void* src) {
-        std::stringstream stream;
-        stream << *reinterpret_cast<const CppType*>(src);
-        return stream.str();
+        return std::to_string(*reinterpret_cast<const CppType*>(src)); // NOTE(review): std::to_string can format some values differently than operator<< did (e.g. floating point, char-sized ints) - confirm for every CppType using this default
     }
 
     static OLAPStatus from_string(void* buf, const std::string& scan_key) {
@@ -704,6 +716,11 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_LARGEINT>
     static void direct_copy(void* dest, const void* src) {
         *reinterpret_cast<PackedInt128*>(dest) = *reinterpret_cast<const PackedInt128*>(src);
     }
+
+    static inline void direct_copy_may_cut(void* dest, const void* src) { // zone-map copy for this type
+        direct_copy(dest, src); // no cutting: reuses the PackedInt128 direct_copy above
+    }
+
     static void set_to_max(void* buf) {
         *reinterpret_cast<PackedInt128*>(buf) = ~((int128_t)(1) << 127);
     }
@@ -979,6 +996,7 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_CHAR> : public BaseFieldtypeTraits<OLAP_F
         auto slice = reinterpret_cast<const Slice*>(src);
         return slice->to_string();
     }
+
     static void deep_copy(void* dest, const void* src, MemPool* mem_pool) {
         auto l_slice = reinterpret_cast<Slice*>(dest);
         auto r_slice = reinterpret_cast<const Slice*>(src);
@@ -1005,6 +1023,17 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_CHAR> : public BaseFieldtypeTraits<OLAP_F
         auto slice = reinterpret_cast<Slice*>(buf);
         memset(slice->data, 0, slice->size);
     }
+
+    static void direct_copy_may_cut(void* dest, const void* src) { // copies at most MAX_ZONE_MAP_INDEX_SIZE bytes of src
+        auto l_slice = reinterpret_cast<Slice*>(dest);
+        auto r_slice = reinterpret_cast<const Slice*>(src);
+        // NOTE(review): dest capacity is not checked; assumes l_slice->data can hold min_size bytes - confirm
+        auto min_size = MAX_ZONE_MAP_INDEX_SIZE >= r_slice->size ? r_slice->size :
+                MAX_ZONE_MAP_INDEX_SIZE;
+        memory_copy(l_slice->data, r_slice->data, min_size);
+        l_slice->size = min_size; // dest size becomes the (possibly cut) copied length
+    }
+
     static uint32_t hash_code(const void* data, uint32_t seed) {
         auto slice = reinterpret_cast<const Slice*>(data);
         return HashUtil::hash(slice->data, slice->size, seed);
diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp
index 40299ce..f6f7b2a 100644
--- a/be/src/runtime/descriptors.cpp
+++ b/be/src/runtime/descriptors.cpp
@@ -22,9 +22,7 @@
 #include <sstream>
 
 #include "common/object_pool.h"
-#include "exprs/expr.h"
 #include "gen_cpp/Descriptors_types.h"
-#include "gen_cpp/PlanNodes_types.h"
 #include "gen_cpp/descriptors.pb.h"
 
 namespace doris {
diff --git a/be/src/runtime/row_batch.cpp b/be/src/runtime/row_batch.cpp
index 739b004..54031c4 100644
--- a/be/src/runtime/row_batch.cpp
+++ b/be/src/runtime/row_batch.cpp
@@ -25,11 +25,9 @@
 #include "runtime/runtime_state.h"
 #include "runtime/string_value.h"
 #include "runtime/tuple_row.h"
-//#include "runtime/mem_tracker.h"
 #include "gen_cpp/Data_types.h"
 #include "gen_cpp/data.pb.h"
 #include "runtime/collection_value.h"
-#include "util/debug_util.h"
 
 using std::vector;
 
diff --git a/be/src/runtime/row_batch.h b/be/src/runtime/row_batch.h
index 66e2f69..9349eef 100644
--- a/be/src/runtime/row_batch.h
+++ b/be/src/runtime/row_batch.h
@@ -25,7 +25,6 @@
 #include "codegen/doris_ir.h"
 #include "common/logging.h"
 #include "runtime/buffered_block_mgr2.h" // for BufferedBlockMgr2::Block
-// #include "runtime/buffered_tuple_stream2.inline.h"
 #include "runtime/bufferpool/buffer_pool.h"
 #include "runtime/descriptors.h"
 #include "runtime/disk_io_mgr.h"
diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp
index d1d085d..2b136a2 100644
--- a/be/src/service/doris_main.cpp
+++ b/be/src/service/doris_main.cpp
@@ -52,12 +52,9 @@
 #include "service/http_service.h"
 #include "util/debug_util.h"
 #include "util/doris_metrics.h"
-#include "util/file_utils.h"
 #include "util/logging.h"
-#include "util/network_util.h"
 #include "util/thrift_rpc_helper.h"
 #include "util/thrift_server.h"
-#include "util/thrift_util.h"
 #include "util/uid_util.h"
 
 static void help(const char*);
diff --git a/be/test/olap/rowset/segment_v2/zone_map_index_test.cpp b/be/test/olap/rowset/segment_v2/zone_map_index_test.cpp
index e795fb5..3f11e69 100644
--- a/be/test/olap/rowset/segment_v2/zone_map_index_test.cpp
+++ b/be/test/olap/rowset/segment_v2/zone_map_index_test.cpp
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "common/config.h"
 #include "olap/rowset/segment_v2/zone_map_index.h"
 
 #include <gtest/gtest.h>
@@ -99,6 +100,47 @@ public:
         ASSERT_EQ(true, zone_maps[2].has_null());
         ASSERT_EQ(false, zone_maps[2].has_not_null());
     }
+
+    void test_cut_zone_map(std::string testname, Field* field) { // feeds five 1024-byte values, then checks the page zone map read back
+        std::string filename = kTestDir + "/" + testname;
+
+        ZoneMapIndexWriter builder(field);
+        char ch = 'a';
+        char buf[1024];
+        for (int i = 0; i < 5; i++) { // values are 1024 bytes each of 'a', 'b', 'c', 'd', 'e'
+            memset(buf, ch + i, 1024);
+            Slice slice(buf, 1024);
+            builder.add_values((const uint8_t*)&slice, 1);
+        }
+        builder.flush();
+
+        // write out zone map index
+        ColumnIndexMetaPB index_meta;
+        {
+            std::unique_ptr<fs::WritableBlock> wblock;
+            fs::CreateBlockOptions opts({filename});
+            ASSERT_TRUE(fs::fs_util::block_manager()->create_block(opts, &wblock).ok());
+            ASSERT_TRUE(builder.finish(wblock.get(), &index_meta).ok());
+            ASSERT_EQ(ZONE_MAP_INDEX, index_meta.type());
+            ASSERT_TRUE(wblock->close().ok());
+        }
+
+        ZoneMapIndexReader column_zone_map(filename, &index_meta.zone_map_index());
+        Status status = column_zone_map.load(true, false);
+        ASSERT_TRUE(status.ok());
+        ASSERT_EQ(1, column_zone_map.num_pages());
+        const std::vector<ZoneMapPB>& zone_maps = column_zone_map.page_zone_maps();
+        ASSERT_EQ(1, zone_maps.size());
+
+        char value[512]; // NOTE(review): no terminator is ever written; confirm the ASSERT_EQ calls below do not treat it as a C string
+        memset(value, 'a', 512);
+        ASSERT_EQ(value, zone_maps[0].min());
+        memset(value, 'f', 512); // NOTE(review): the loop above writes 'a'..'e', so 'f' here looks inconsistent - confirm expected max
+        value[511] += 1; // mirrors the +1 that modify_zone_map_index applies to the last byte
+        ASSERT_EQ(value, zone_maps[0].max());
+        ASSERT_EQ(false, zone_maps[0].has_null());
+        ASSERT_EQ(true, zone_maps[0].has_not_null());
+    }
 };
 
 // Test for int
@@ -171,6 +213,15 @@ TEST_F(ColumnZoneMapTest, NormalTestCharPage) {
     delete field;
 }
 
+// Test for the zone map size limit (values longer than MAX_ZONE_MAP_INDEX_SIZE)
+TEST_F(ColumnZoneMapTest, ZoneMapCut) {
+    TabletColumn varchar_column = create_varchar_key(0);
+    varchar_column.set_index_length(1024);
+    Field* field = FieldFactory::create(varchar_column);
+    test_string("ZoneMapCut", field); // NOTE(review): runs test_string, not test_cut_zone_map - confirm which helper was intended
+    delete field;
+}
+
 } // namespace segment_v2
 } // namespace doris
 

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org