You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zh...@apache.org on 2019/09/30 08:25:36 UTC

[incubator-doris] branch master updated: v2 segment support string encode(#1766) (#1816)

This is an automated email from the ASF dual-hosted git repository.

zhaoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8aa8e08  v2 segment support string encode(#1766) (#1816)
8aa8e08 is described below

commit 8aa8e08f278fabf125f9883c6eb22137ca2159b2
Author: wangbo <50...@qq.com>
AuthorDate: Mon Sep 30 16:25:31 2019 +0800

    v2 segment support string encode(#1766) (#1816)
    
    major change
    
    change data format of binary dict page, appending (dict page data) and (dict page offset) to binary dict page;
    add new decoding method for new binary dict page format
    add ut for segment test
    set the elements of the initial array to 0 when calling arena.AllocateNewBlock
    hard-code the choice of dict encoding for string columns
    0919 commit major change
    
    change dict file format: when saving a binary dict page, separate the dict page from the data pages, since one dict page may serve multiple data pages; when reading a binary dict page, one ColumnReader keeps one dict page
    loading dict when calling column_reader._read_page
    3.rollback BinaryDictPage
    no longer use memset(0) to initialize column_zonemap.max_value
    0926 17 commit major change
    
    init column_zone_map min value column_zone_map slice's data array;
    set the size of the char/varchar column_zone_map's max value to 0
    add ut for char column zone map query hit/miss
    0929 10 commit major change
    
    allocate memory for column_zone_map's max and min values
    direct copy content to column_zone_map's max and min value
---
 be/src/olap/field.h                                |   5 +
 be/src/olap/olap_define.h                          |   5 +-
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp |  13 +-
 be/src/olap/rowset/segment_v2/binary_dict_page.h   |   6 +-
 be/src/olap/rowset/segment_v2/column_reader.cpp    |  23 ++
 be/src/olap/rowset/segment_v2/column_reader.h      |   9 +
 be/src/olap/rowset/segment_v2/column_writer.cpp    |  11 +
 be/src/olap/rowset/segment_v2/column_writer.h      |   1 +
 be/src/olap/rowset/segment_v2/column_zone_map.cpp  |  14 +-
 be/src/olap/rowset/segment_v2/column_zone_map.h    |   1 -
 be/src/olap/rowset/segment_v2/encoding_info.cpp    |  17 ++
 be/src/olap/rowset/segment_v2/options.h            |   1 -
 be/src/olap/types.cpp                              |   1 +
 be/src/olap/types.h                                |  23 ++
 .../rowset/segment_v2/binary_dict_page_test.cpp    |   5 +-
 .../rowset/segment_v2/binary_plain_page_test.cpp   |   2 +-
 be/test/olap/rowset/segment_v2/segment_test.cpp    | 241 +++++++++++++++++++++
 be/test/olap/tablet_schema_helper.h                |  24 ++
 gensrc/proto/segment_v2.proto                      |   2 +-
 19 files changed, 384 insertions(+), 20 deletions(-)

diff --git a/be/src/olap/field.h b/be/src/olap/field.h
index 0416789..7a2c2a4 100644
--- a/be/src/olap/field.h
+++ b/be/src/olap/field.h
@@ -57,6 +57,7 @@ public:
 
     inline void set_to_max(char* buf) const { return _type_info->set_to_max(buf); }
     inline void set_to_min(char* buf) const { return _type_info->set_to_min(buf); }
+    inline char* allocate_value_from_arena(Arena* arena) const { return _type_info->allocate_value_from_arena(arena); }
 
     inline void agg_update(RowCursorCell* dest, const RowCursorCell& src, MemPool* mem_pool = nullptr) const {
         _agg_info->update(dest, src, mem_pool);
@@ -199,6 +200,10 @@ public:
         _type_info->deep_copy_with_arena(dest, src, arena);
     }
 
+    inline void direct_copy_content(char* dest, const char* src) const {
+        _type_info->direct_copy(dest, src);
+    }
+
     // Copy srouce content to destination in index format.
     template<typename DstCellType, typename SrcCellType>
     void to_index(DstCellType* dst, const SrcCellType& src) const;
diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h
index 3735f1e..685a9ef 100644
--- a/be/src/olap/olap_define.h
+++ b/be/src/olap/olap_define.h
@@ -50,9 +50,12 @@ static const uint64_t OLAP_FIX_HEADER_MAGIC_NUMBER = 0;
 // 执行be/ce时默认的候选集大小
 static constexpr uint32_t OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE = 10;
 
-// the max length supported for string type
+// the max length supported for varchar type
 static const uint16_t OLAP_STRING_MAX_LENGTH = 65535;
 
+//the max length supported for char type
+static const uint16_t OLAP_CHAR_MAX_LENGTH = 255;
+
 static const int32_t PREFERRED_SNAPSHOT_VERSION = 3;
 
 // the max bytes for stored string length
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index 675ed26..3c80e29 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -102,7 +102,7 @@ Slice BinaryDictPageBuilder::finish() {
     Slice data_slice = _data_page_builder->finish();
     _buffer.append(data_slice.data, data_slice.size);
     encode_fixed32_le(&_buffer[0], _encoding_type);
-    return Slice(_buffer.data(), _buffer.size());
+    return Slice(_buffer);
 }
 
 void BinaryDictPageBuilder::reset() {
@@ -147,7 +147,6 @@ BinaryDictPageDecoder::BinaryDictPageDecoder(Slice data, const PageDecoderOption
     _data(data),
     _options(options),
     _data_page_decoder(nullptr),
-    _dict_decoder(options.dict_decoder),
     _parsed(false),
     _encoding_type(UNKNOWN_ENCODING) { }
 
@@ -161,7 +160,6 @@ Status BinaryDictPageDecoder::init() {
     _encoding_type = static_cast<EncodingTypePB>(type);
     _data.remove_prefix(BINARY_DICT_PAGE_HEADER_SIZE);
     if (_encoding_type == DICT_ENCODING) {
-        DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr";
         _data_page_decoder.reset(new BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
     } else if (_encoding_type == PLAIN_ENCODING) {
         DCHECK_EQ(_encoding_type, PLAIN_ENCODING);
@@ -180,12 +178,21 @@ Status BinaryDictPageDecoder::seek_to_position_in_page(size_t pos) {
     return _data_page_decoder->seek_to_position_in_page(pos);
 }
 
+bool BinaryDictPageDecoder::is_dict_encoding() const {
+    return _encoding_type == DICT_ENCODING;
+}
+
+void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder) {
+    _dict_decoder = static_cast<BinaryPlainPageDecoder*>(dict_decoder);  // stored as a raw pointer; must be a BinaryPlainPageDecoder
+}
+
 Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) {
     if (_encoding_type == PLAIN_ENCODING) {
         return _data_page_decoder->next_batch(n, dst);
     }
     // dictionary encoding
     DCHECK(_parsed);
+    DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr";
     if (PREDICT_FALSE(*n == 0)) {
         *n = 0;
         return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index e434e99..7951efd 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -116,11 +116,15 @@ public:
         return _data_page_decoder->current_index();
     }
 
+    bool is_dict_encoding() const;
+
+    void set_dict_decoder(PageDecoder* dict_decoder);
+
 private:
     Slice _data;
     PageDecoderOptions _options;
     std::unique_ptr<PageDecoder> _data_page_decoder;
-    BinaryPlainPageDecoder* _dict_decoder;
+    const BinaryPlainPageDecoder* _dict_decoder = nullptr;
     bool _parsed;
     EncodingTypePB _encoding_type;
     faststring _code_buf;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index c08b39a..9c7c69d 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -32,6 +32,7 @@
 #include "util/crc32c.h"
 #include "util/rle_encoding.h" // for RleDecoder
 #include "util/block_compression.h"
+#include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder
 
 namespace doris {
 namespace segment_v2 {
@@ -168,6 +169,10 @@ void ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column,
     _calculate_row_ranges(page_indexes, row_ranges);
 }
 
+PagePointer ColumnReader::get_dict_page_pointer() const {
+    return _meta.dict_page();
+}
+
 void ColumnReader::_get_filtered_pages(CondColumn* cond_column,
         const std::vector<CondColumn*>& delete_conditions, std::vector<uint32_t>* page_indexes) {
     FieldType type = _type_info->type();
@@ -426,6 +431,24 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars
     RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder));
     RETURN_IF_ERROR(page->data_decoder->init());
 
+    // lazy init dict_encoding'dict for three reasons
+    // 1. a column use dictionary encoding still has non-dict-encoded data pages are seeked,load dict when necessary
+    // 2. ColumnReader which is owned by Segment and Rowset can being alive even when there is no query,it should retain memory as small as possible.
+    // 3. Iterators of the same column won't repeat load the dict page because of page cache.
+    if (_reader->encoding_info()->encoding() == DICT_ENCODING) {
+        BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder;
+        if (binary_dict_page_decoder->is_dict_encoding()) {
+            if (_dict_decoder == nullptr) {
+                PagePointer pp = _reader->get_dict_page_pointer();
+                RETURN_IF_ERROR(_reader->read_page(pp, &_dict_page_handle));
+
+                _dict_decoder.reset(new BinaryPlainPageDecoder(_dict_page_handle.data()));
+                RETURN_IF_ERROR(_dict_decoder->init());
+            }
+            binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get());
+        }
+    }
+
     page->offset_in_page = 0;
 
     return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 6d425cb..af34db2 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -29,6 +29,7 @@
 #include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator
 #include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap
 #include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges
+#include "olap/rowset/segment_v2/page_handle.h" // for PageHandle
 
 namespace doris {
 
@@ -85,6 +86,8 @@ public:
     void get_row_ranges_by_zone_map(CondColumn* cond_column,
             const std::vector<CondColumn*>& delete_conditions, RowRanges* row_ranges);
 
+    PagePointer get_dict_page_pointer() const;
+
 private:
     Status _init_ordinal_index();
 
@@ -189,6 +192,12 @@ private:
     // 3. When _page is null, it means that this reader can not be read.
     std::unique_ptr<ParsedPage> _page;
 
+    // keep dict page decoder
+    std::unique_ptr<PageDecoder> _dict_decoder;
+
+    // keep dict page handle to avoid released
+    PageHandle _dict_page_handle;
+
     // page iterator used to get next page when current page is finished.
     // This value will be reset when a new seek is issued
     OrdinalPageIndexIterator _page_iter;
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp
index fe8dade..4faf372 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -213,6 +213,14 @@ Status ColumnWriter::write_data() {
         RETURN_IF_ERROR(_write_data_page(page));
         page = page->next;
     }
+    // write column dict
+    if (_encoding_info->encoding() == DICT_ENCODING) {
+        Slice dict_page;
+        _page_builder->get_dictionary_page(&dict_page);
+        std::vector<Slice> origin_data;
+        origin_data.push_back(dict_page);
+        RETURN_IF_ERROR(_write_physical_page(&origin_data, &_dict_page_pp));
+    }
     return Status::OK();
 }
 
@@ -240,6 +248,9 @@ void ColumnWriter::write_meta(ColumnMetaPB* meta) {
     if (_opts.need_zone_map) {
         _zone_map_pp.to_proto(meta->mutable_zone_map_page());
     }
+    if (_encoding_info->encoding() == DICT_ENCODING) {
+        _dict_page_pp.to_proto(meta->mutable_dict_page());
+    }
 }
 
 // write a page into file and update ordinal index
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h
index 76cd5b4..b881c13 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -157,6 +157,7 @@ private:
 
     PagePointer _ordinal_index_pp;
     PagePointer _zone_map_pp;
+    PagePointer _dict_page_pp;
     uint64_t _written_size = 0;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp
index 909a9cf..61dd3f2 100644
--- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp
+++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp
@@ -28,9 +28,9 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in
     options.data_page_size = 0;
     _page_builder.reset(new BinaryPlainPageBuilder(options));
     _field.reset(FieldFactory::create_by_type(_type_info->type()));
-    _max_string_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH);
-    _zone_map.min_value = _arena.Allocate(_type_info->size());
-    _zone_map.max_value = _arena.Allocate(_type_info->size());
+    _zone_map.min_value = _field->allocate_value_from_arena(&_arena);
+    _zone_map.max_value = _field->allocate_value_from_arena(&_arena);
+
     _reset_zone_map();
 }
 
@@ -38,10 +38,10 @@ Status ColumnZoneMapBuilder::add(const uint8_t *vals, size_t count) {
     if (vals != nullptr) {
         for (int i = 0; i < count; ++i) {
             if (_field->compare(_zone_map.min_value, (char *)vals) > 0) {
-                _field->deep_copy_content(_zone_map.min_value, (const char *)vals, &_arena);
+                _field->direct_copy_content(_zone_map.min_value, (const char *)vals);
             }
             if (_field->compare(_zone_map.max_value, (char *)vals) < 0) {
-                _field->deep_copy_content(_zone_map.max_value, (const char *)vals, &_arena);
+                _field->direct_copy_content(_zone_map.max_value, (const char *)vals);
             }
             vals += _type_info->size();
             if (!_zone_map.has_not_null) {
@@ -78,10 +78,6 @@ Status ColumnZoneMapBuilder::flush() {
 }
 
 void ColumnZoneMapBuilder::_reset_zone_map() {
-    // we should allocate max varchar length and set to max for min value
-    Slice *min_slice = (Slice *)_zone_map.min_value;
-    min_slice->data = _max_string_value;
-    min_slice->size = OLAP_STRING_MAX_LENGTH;
     _field->set_to_max(_zone_map.min_value);
     _field->set_to_min(_zone_map.max_value);
     _zone_map.has_null = false;
diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h b/be/src/olap/rowset/segment_v2/column_zone_map.h
index 1fb01bc..3e2b7ac 100644
--- a/be/src/olap/rowset/segment_v2/column_zone_map.h
+++ b/be/src/olap/rowset/segment_v2/column_zone_map.h
@@ -73,7 +73,6 @@ private:
     std::unique_ptr<Field> _field;
     // memory will be managed by arena
     ZoneMap _zone_map;
-    char* _max_string_value;
     Arena _arena;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp b/be/src/olap/rowset/segment_v2/encoding_info.cpp
index 3cfb715..31539a2 100644
--- a/be/src/olap/rowset/segment_v2/encoding_info.cpp
+++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp
@@ -20,6 +20,7 @@
 #include "olap/olap_common.h"
 #include "olap/rowset/segment_v2/bitshuffle_page.h"
 #include "olap/rowset/segment_v2/rle_page.h"
+#include "olap/rowset/segment_v2/binary_dict_page.h"
 
 namespace doris {
 namespace segment_v2 {
@@ -67,6 +68,18 @@ struct TypeEncodingTraits<type, RLE> {
     }
 };
 
+template<FieldType type>
+struct TypeEncodingTraits<type, DICT_ENCODING> {
+    static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) {
+        *builder = new BinaryDictPageBuilder(opts);
+        return Status::OK();
+    }
+    static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts, PageDecoder** decoder) {
+        *decoder = new BinaryDictPageDecoder(data, opts);
+        return Status::OK();
+    }
+};
+
 template<FieldType Type, EncodingTypePB Encoding>
 struct EncodingTraits : TypeEncodingTraits<Type, Encoding> {
     static const FieldType type = Type;
@@ -122,6 +135,10 @@ EncodingInfoResolver::EncodingInfoResolver() {
     _add_map<OLAP_FIELD_TYPE_FLOAT, PLAIN_ENCODING>();
     _add_map<OLAP_FIELD_TYPE_DOUBLE, BIT_SHUFFLE>();
     _add_map<OLAP_FIELD_TYPE_DOUBLE, PLAIN_ENCODING>();
+    _add_map<OLAP_FIELD_TYPE_CHAR, DICT_ENCODING>();
+    _add_map<OLAP_FIELD_TYPE_CHAR, PLAIN_ENCODING>();
+    _add_map<OLAP_FIELD_TYPE_VARCHAR, DICT_ENCODING>();
+    _add_map<OLAP_FIELD_TYPE_VARCHAR, PLAIN_ENCODING>();
     _add_map<OLAP_FIELD_TYPE_BOOL, RLE>();
     _add_map<OLAP_FIELD_TYPE_BOOL, BIT_SHUFFLE>();
     _add_map<OLAP_FIELD_TYPE_BOOL, PLAIN_ENCODING>();
diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h
index 386afad..3997e0d 100644
--- a/be/src/olap/rowset/segment_v2/options.h
+++ b/be/src/olap/rowset/segment_v2/options.h
@@ -31,7 +31,6 @@ struct PageBuilderOptions {
 };
 
 struct PageDecoderOptions {
-    BinaryPlainPageDecoder* dict_decoder = nullptr;
 };
 
 } // namespace segment_v2
diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp
index 87ef27b..36704fc 100644
--- a/be/src/olap/types.cpp
+++ b/be/src/olap/types.cpp
@@ -27,6 +27,7 @@ TypeInfo::TypeInfo(TypeTraitsClass t)
         _deep_copy(TypeTraitsClass::deep_copy),
         _deep_copy_with_arena(TypeTraitsClass::deep_copy_with_arena),
         _direct_copy(TypeTraitsClass::direct_copy),
+        _allocate_value_from_arena(TypeTraitsClass::allocate_value_from_arena),
         _from_string(TypeTraitsClass::from_string),
         _to_string(TypeTraitsClass::to_string),
         _set_to_max(TypeTraitsClass::set_to_max),
diff --git a/be/src/olap/types.h b/be/src/olap/types.h
index 2b63035..bd38110 100644
--- a/be/src/olap/types.h
+++ b/be/src/olap/types.h
@@ -64,6 +64,10 @@ public:
         _direct_copy(dest, src);
     }
 
+    inline char* allocate_value_from_arena(Arena* arena) const {
+        return _allocate_value_from_arena(arena);
+    }
+
     OLAPStatus from_string(void* buf, const std::string& scan_key) const {
         return _from_string(buf, scan_key);
     }
@@ -85,6 +89,7 @@ private:
     void (*_deep_copy)(void* dest, const void* src, MemPool* mem_pool);
     void (*_deep_copy_with_arena)(void* dest, const void* src, Arena* arena);
     void (*_direct_copy)(void* dest, const void* src);
+    char* (*_allocate_value_from_arena)(Arena* arena);
 
     OLAPStatus (*_from_string)(void* buf, const std::string& scan_key);
     std::string (*_to_string)(const void* src);
@@ -213,6 +218,10 @@ struct BaseFieldtypeTraits : public CppTypeTraits<field_type> {
         return HashUtil::hash(data, sizeof(CppType), seed);
     }
 
+    static inline char* allocate_value_from_arena(Arena* arena) {
+        return arena->Allocate(sizeof(CppType));
+    }
+
     static std::string to_string(const void* src) {
         std::stringstream stream;
         stream << *reinterpret_cast<const CppType*>(src);
@@ -568,6 +577,13 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_CHAR> : public BaseFieldtypeTraits<OLAP_F
         auto slice = reinterpret_cast<const Slice*>(data);
         return HashUtil::hash(slice->data, slice->size, seed);
     }
+    static char* allocate_value_from_arena(Arena* arena) {
+        char* type_value = arena->Allocate(sizeof(Slice));
+        auto slice = reinterpret_cast<Slice*>(type_value);
+        slice->size = OLAP_CHAR_MAX_LENGTH;
+        slice->data = arena->Allocate(OLAP_CHAR_MAX_LENGTH);
+        return type_value;
+    }
 };
 
 template<>
@@ -594,6 +610,13 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_VARCHAR> : public FieldTypeTraits<OLAP_FI
         auto slice = reinterpret_cast<Slice*>(buf);
         slice->size = 0;
     }
+    static char* allocate_value_from_arena(Arena* arena) {
+        char* type_value = arena->Allocate(sizeof(Slice));
+        auto slice = reinterpret_cast<Slice*>(type_value);
+        slice->size = OLAP_STRING_MAX_LENGTH;
+        slice->data = arena->Allocate(OLAP_STRING_MAX_LENGTH);
+        return type_value;
+    }
 };
 
 template<>
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index 9a25b7a..d082bca 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -62,8 +62,9 @@ public:
 
         // decode
         PageDecoderOptions decoder_options;
-        decoder_options.dict_decoder = dict_page_decoder.get();
         BinaryDictPageDecoder page_decoder(s, decoder_options);
+        page_decoder.set_dict_decoder(dict_page_decoder.get());
+
         status = page_decoder.init();
         ASSERT_TRUE(status.ok());
         ASSERT_EQ(slices.size(), page_decoder.count());
@@ -154,9 +155,9 @@ public:
 
             // decode
             PageDecoderOptions decoder_options;
-            decoder_options.dict_decoder = dict_page_decoder.get();
             BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options);
             status = page_decoder.init();
+            page_decoder.set_dict_decoder(dict_page_decoder.get());
             ASSERT_TRUE(status.ok());
 
             //check values
diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
index 0ae68fb..45f0242 100644
--- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
@@ -56,7 +56,7 @@ public:
         PageDecoderType page_decoder(s, decoder_options);
         Status status = page_decoder.init();
         ASSERT_TRUE(status.ok());
-        
+
         //test1
         
         size_t size = 3;
diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp
index 80ce4ff..01121ce 100644
--- a/be/test/olap/rowset/segment_v2/segment_test.cpp
+++ b/be/test/olap/rowset/segment_v2/segment_test.cpp
@@ -606,6 +606,247 @@ TEST_F(SegmentReaderWriterTest, TestDefaultValueColumn) {
     }
 }
 
+// Test helper: stores `src` into the cell at `target` according to `fieldType`.
+// CHAR -> Slice of `_length` bytes (decimal digits, zero padded); VARCHAR -> Slice of the digits; else raw int.
+void set_column_value_by_type(FieldType fieldType, int src, char* target, Arena* _arena, size_t _length = 0) {
+    if (fieldType != OLAP_FIELD_TYPE_CHAR && fieldType != OLAP_FIELD_TYPE_VARCHAR) {
+        *(int*)target = src;
+        return;
+    }
+    // Hold the digits in a named std::string: `&std::to_string(src)[0]` points into a
+    // temporary that is destroyed at the end of that statement, so reading it later is UB.
+    const std::string src_value = std::to_string(src);
+    const size_t src_len = src_value.size();
+
+    auto* dest_slice = (Slice*)target;
+    // CHAR cells are fixed width; VARCHAR cells are exactly as long as the digits.
+    dest_slice->size = (fieldType == OLAP_FIELD_TYPE_CHAR) ? _length : src_len;
+    dest_slice->data = _arena->Allocate(dest_slice->size);
+    // Never copy more than the cell holds, so a too-small `_length` cannot overflow the buffer.
+    const size_t copy_len = src_len < dest_slice->size ? src_len : dest_slice->size;
+    memcpy(dest_slice->data, src_value.data(), copy_len);
+    // Zero the tail (a no-op for VARCHAR, where copy_len == size).
+    memset(dest_slice->data + copy_len, 0, dest_slice->size - copy_len);
+}
+
+TEST_F(SegmentReaderWriterTest, TestStringDict) {
+    size_t num_rows_per_block = 10;
+    Arena _arena;
+
+    std::shared_ptr<TabletSchema> tablet_schema(new TabletSchema());
+    tablet_schema->_num_columns = 4;
+    tablet_schema->_num_key_columns = 3;
+    tablet_schema->_num_short_key_columns = 2;
+    tablet_schema->_num_rows_per_row_block = num_rows_per_block;
+    tablet_schema->_cols.push_back(create_char_key(1));
+    tablet_schema->_cols.push_back(create_char_key(2));
+    tablet_schema->_cols.push_back(create_varchar_key(3));
+    tablet_schema->_cols.push_back(create_varchar_key(4));
+
+    //    segment write
+    std::string dname = "./ut_dir/segment_test";
+    FileUtils::create_dir(dname);
+
+    SegmentWriterOptions opts;
+    opts.num_rows_per_block = num_rows_per_block;
+
+    std::string fname = dname + "/string_case";
+
+    SegmentWriter writer(fname, 0, tablet_schema.get(), opts);
+    auto st = writer.init(10);
+    ASSERT_TRUE(st.ok());
+
+    RowCursor row;
+    auto olap_st = row.init(*tablet_schema);
+    ASSERT_EQ(OLAP_SUCCESS, olap_st);
+
+    // 0, 1, 2, 3
+    // 10, 11, 12, 13
+    // 20, 21, 22, 23
+    // convert int to string
+    for (int i = 0; i < 4096; ++i) {
+        for (int j = 0; j < 4; ++j) {
+            auto cell = row.cell(j);
+            cell.set_not_null();
+            set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[j]._length);
+        }
+        Status status = writer.append_row(row);
+        ASSERT_TRUE(status.ok());
+    }
+
+    uint64_t file_size = 0;
+    st = writer.finalize(&file_size);
+    ASSERT_TRUE(st.ok());
+
+    {
+        std::shared_ptr<Segment> segment(new Segment(fname, 0, tablet_schema.get()));
+        st = segment->open();
+        ASSERT_TRUE(st.ok());
+        ASSERT_EQ(4096, segment->num_rows());
+        Schema schema(*tablet_schema);
+
+        // scan all rows
+        {
+            StorageReadOptions read_opts;
+            std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+            RowBlockV2 block(schema, 1024);
+
+            int left = 4096;
+            int rowid = 0;
+
+            while (left > 0)  {
+                int rows_read = left > 1024 ? 1024 : left;
+                block.clear();
+                st = iter->next_batch(&block);
+                ASSERT_TRUE(st.ok());
+                ASSERT_EQ(rows_read, block.num_rows());
+                left -= rows_read;
+
+                for (int j = 0; j < block.schema()->column_ids().size(); ++j) {
+                    auto cid = block.schema()->column_ids()[j];
+                    auto column_block = block.column_block(j);
+                    for (int i = 0; i < rows_read; ++i) {
+                        int rid = rowid + i;
+                        ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i));
+                        const Slice* actual = reinterpret_cast<const Slice*>(column_block.cell_ptr(i));
+
+                        Slice expect;
+                        set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast<char*>(&expect), &_arena, tablet_schema->_cols[j]._length);
+                        ASSERT_EQ(expect.to_string(), actual->to_string());
+                    }
+                }
+                rowid += rows_read;
+            }
+        }
+
+        // test seek, key
+        {
+            // lower bound
+            std::unique_ptr<RowCursor> lower_bound(new RowCursor());
+            lower_bound->init(*tablet_schema, 1);
+            {
+                auto cell = lower_bound->cell(0);
+                cell.set_not_null();
+                set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 40970, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length);
+            }
+
+            StorageReadOptions read_opts;
+            read_opts.key_ranges.emplace_back(lower_bound.get(), false, nullptr, false);
+            std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+            RowBlockV2 block(schema, 100);
+            st = iter->next_batch(&block);
+            ASSERT_TRUE(st.is_end_of_file());
+            ASSERT_EQ(0, block.num_rows());
+        }
+
+        // test seek, key (-2, -1)
+        {
+            // lower bound
+            std::unique_ptr<RowCursor> lower_bound(new RowCursor());
+            lower_bound->init(*tablet_schema, 1);
+            {
+                auto cell = lower_bound->cell(0);
+                cell.set_not_null();
+                set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -2, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length);
+            }
+
+            std::unique_ptr<RowCursor> upper_bound(new RowCursor());
+            upper_bound->init(*tablet_schema, 1);
+            {
+                auto cell = upper_bound->cell(0);
+                cell.set_not_null();
+                set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -1, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length);
+            }
+
+            StorageReadOptions read_opts;
+            read_opts.key_ranges.emplace_back(lower_bound.get(), false, upper_bound.get(), false);
+            std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+            RowBlockV2 block(schema, 100);
+            st = iter->next_batch(&block);
+            ASSERT_TRUE(st.is_end_of_file());
+            ASSERT_EQ(0, block.num_rows());
+        }
+
+        // test char zone_map query hit;should read whole page
+        {
+            TCondition condition;
+            condition.__set_column_name("1");
+            condition.__set_condition_op(">");
+            std::vector<std::string> vals = {"100"};
+            condition.__set_condition_values(vals);
+            std::shared_ptr<Conditions> conditions(new Conditions());
+            conditions->set_tablet_schema(tablet_schema.get());
+            conditions->append_condition(condition);
+
+            StorageReadOptions read_opts;
+            read_opts.conditions = conditions.get();
+
+            std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+            RowBlockV2 block(schema, 1024);
+            int left = 4 * 1024;
+            int rowid = 0;
+
+            while (left > 0)  {
+                int rows_read = left > 1024 ? 1024 : left;
+                block.clear();
+                st = iter->next_batch(&block);
+                ASSERT_TRUE(st.ok());
+                ASSERT_EQ(rows_read, block.num_rows());
+                left -= rows_read;
+
+                for (int j = 0; j < block.schema()->column_ids().size(); ++j) {
+                    auto cid = block.schema()->column_ids()[j];
+                    auto column_block = block.column_block(j);
+                    for (int i = 0; i < rows_read; ++i) {
+                        int rid = rowid + i;
+                        ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i));
+
+                        const Slice* actual = reinterpret_cast<const Slice*>(column_block.cell_ptr(i));
+                        Slice expect;
+                        set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast<char*>(&expect), &_arena, tablet_schema->_cols[j]._length);
+                        ASSERT_EQ(expect.to_string(), actual->to_string()) << "rid:" << rid << ", i:" << i;;
+                    }
+                }
+                rowid += rows_read;
+            }
+            ASSERT_EQ(4 * 1024, rowid);
+            st = iter->next_batch(&block);
+            ASSERT_TRUE(st.is_end_of_file());
+            ASSERT_EQ(0, block.num_rows());
+        }
+
+        // test char zone_map query miss;col < -1
+        {
+            TCondition condition;
+            condition.__set_column_name("1");
+            condition.__set_condition_op("<");
+            std::vector<std::string> vals = {"-2"};
+            condition.__set_condition_values(vals);
+            std::shared_ptr<Conditions> conditions(new Conditions());
+            conditions->set_tablet_schema(tablet_schema.get());
+            conditions->append_condition(condition);
+
+            StorageReadOptions read_opts;
+            read_opts.conditions = conditions.get();
+
+            std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+            RowBlockV2 block(schema, 1024);
+
+            st = iter->next_batch(&block);
+            ASSERT_TRUE(st.is_end_of_file());
+            ASSERT_EQ(0, block.num_rows());
+        }
+
+    }
+
+    FileUtils::remove_all(dname);
+}
+
 }
 }
 
diff --git a/be/test/olap/tablet_schema_helper.h b/be/test/olap/tablet_schema_helper.h
index 8e22b33..73ecdf3 100644
--- a/be/test/olap/tablet_schema_helper.h
+++ b/be/test/olap/tablet_schema_helper.h
@@ -55,4 +55,28 @@ TabletColumn create_int_value(
     return column;
 }
 
+TabletColumn create_char_key(int32_t id, bool is_nullable = true) {
+    TabletColumn column;
+    column._unique_id = id;
+    column._col_name = std::to_string(id);
+    column._type = OLAP_FIELD_TYPE_CHAR;
+    column._is_key = true;
+    column._is_nullable = is_nullable;
+    column._length = 8;
+    column._index_length = 1;
+    return column;
+}
+
+TabletColumn create_varchar_key(int32_t id, bool is_nullable = true) {
+    TabletColumn column;
+    column._unique_id = id;
+    column._col_name = std::to_string(id);
+    column._type = OLAP_FIELD_TYPE_VARCHAR;
+    column._is_key = true;
+    column._is_nullable = is_nullable;
+    column._length = 4;
+    column._index_length = 4;
+    return column;
+}
+
 }
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index 4a3f861..8abe0ae 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -95,7 +95,7 @@ message ColumnMetaPB {
     optional PagePointerPB zone_map_page = 8;
 
     // // dictionary page for DICT_ENCODING
-    // optional PagePointerPB dict_page = 2;
+    optional PagePointerPB dict_page = 9;
 
     // // bloom filter pages for bloom filter column
     // repeated PagePointerPB bloom_filter_pages = 3;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org