You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zh...@apache.org on 2019/09/30 08:25:36 UTC
[incubator-doris] branch master updated: v2 segment support string
encode(#1766) (#1816)
This is an automated email from the ASF dual-hosted git repository.
zhaoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 8aa8e08 v2 segment support string encode(#1766) (#1816)
8aa8e08 is described below
commit 8aa8e08f278fabf125f9883c6eb22137ca2159b2
Author: wangbo <50...@qq.com>
AuthorDate: Mon Sep 30 16:25:31 2019 +0800
v2 segment support string encode(#1766) (#1816)
major change
change data format of binary dict page, appending (dict page data) and (dict page offset) to binary dict page;
add new decoding method for new binary dict page format
add ut for segment test
set the elements of the initial array to 0 when calling arena.AllocateNewBlock
hard-code the choice of dict encoding for string columns
0919 commit major change
change dict file format: when saving a binary dict page, separate the dict page from the data pages, so one dict page may serve multiple data pages; when reading a binary dict page, one ColumnReader keeps one dict page
load the dict when calling column_reader._read_page
roll back BinaryDictPage
no longer use memset(0) to initialize column_zonemap.max_value
0926 17 commit major change
initialize the data array of the column_zone_map min value slice;
set char/varchar column_zone_map's max value size to 0
add ut for char column zone map query hit/miss
0929 10 commit major change
allocate memory for column_zone_map's max and min values
directly copy content to column_zone_map's max and min values
---
be/src/olap/field.h | 5 +
be/src/olap/olap_define.h | 5 +-
be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 13 +-
be/src/olap/rowset/segment_v2/binary_dict_page.h | 6 +-
be/src/olap/rowset/segment_v2/column_reader.cpp | 23 ++
be/src/olap/rowset/segment_v2/column_reader.h | 9 +
be/src/olap/rowset/segment_v2/column_writer.cpp | 11 +
be/src/olap/rowset/segment_v2/column_writer.h | 1 +
be/src/olap/rowset/segment_v2/column_zone_map.cpp | 14 +-
be/src/olap/rowset/segment_v2/column_zone_map.h | 1 -
be/src/olap/rowset/segment_v2/encoding_info.cpp | 17 ++
be/src/olap/rowset/segment_v2/options.h | 1 -
be/src/olap/types.cpp | 1 +
be/src/olap/types.h | 23 ++
.../rowset/segment_v2/binary_dict_page_test.cpp | 5 +-
.../rowset/segment_v2/binary_plain_page_test.cpp | 2 +-
be/test/olap/rowset/segment_v2/segment_test.cpp | 241 +++++++++++++++++++++
be/test/olap/tablet_schema_helper.h | 24 ++
gensrc/proto/segment_v2.proto | 2 +-
19 files changed, 384 insertions(+), 20 deletions(-)
diff --git a/be/src/olap/field.h b/be/src/olap/field.h
index 0416789..7a2c2a4 100644
--- a/be/src/olap/field.h
+++ b/be/src/olap/field.h
@@ -57,6 +57,7 @@ public:
inline void set_to_max(char* buf) const { return _type_info->set_to_max(buf); }
inline void set_to_min(char* buf) const { return _type_info->set_to_min(buf); }
+    // Allocates storage for one value of this field from `arena` by delegating
+    // to the field's TypeInfo, and returns the pointer it hands back.
+    inline char* allocate_value_from_arena(Arena* arena) const {
+        char* value_buf = _type_info->allocate_value_from_arena(arena);
+        return value_buf;
+    }
inline void agg_update(RowCursorCell* dest, const RowCursorCell& src, MemPool* mem_pool = nullptr) const {
_agg_info->update(dest, src, mem_pool);
@@ -199,6 +200,10 @@ public:
_type_info->deep_copy_with_arena(dest, src, arena);
}
+    // Copies the value at `src` into `dest` using the type's direct_copy
+    // routine. Unlike deep_copy_content, no arena is passed to the type.
+    inline void direct_copy_content(char* dest, const char* src) const
+    {
+        _type_info->direct_copy(dest, src);
+    }
+
// Copy srouce content to destination in index format.
template<typename DstCellType, typename SrcCellType>
void to_index(DstCellType* dst, const SrcCellType& src) const;
diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h
index 3735f1e..685a9ef 100644
--- a/be/src/olap/olap_define.h
+++ b/be/src/olap/olap_define.h
@@ -50,9 +50,12 @@ static const uint64_t OLAP_FIX_HEADER_MAGIC_NUMBER = 0;
// 执行be/ce时默认的候选集大小
static constexpr uint32_t OLAP_COMPACTION_DEFAULT_CANDIDATE_SIZE = 10;
-// the max length supported for string type
+// the max length supported for varchar type
static const uint16_t OLAP_STRING_MAX_LENGTH = 65535;
+//the max length supported for char type
+static const uint16_t OLAP_CHAR_MAX_LENGTH = 255;
+
static const int32_t PREFERRED_SNAPSHOT_VERSION = 3;
// the max bytes for stored string length
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index 675ed26..3c80e29 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -102,7 +102,7 @@ Slice BinaryDictPageBuilder::finish() {
Slice data_slice = _data_page_builder->finish();
_buffer.append(data_slice.data, data_slice.size);
encode_fixed32_le(&_buffer[0], _encoding_type);
- return Slice(_buffer.data(), _buffer.size());
+ return Slice(_buffer);
}
void BinaryDictPageBuilder::reset() {
@@ -147,7 +147,6 @@ BinaryDictPageDecoder::BinaryDictPageDecoder(Slice data, const PageDecoderOption
_data(data),
_options(options),
_data_page_decoder(nullptr),
- _dict_decoder(options.dict_decoder),
_parsed(false),
_encoding_type(UNKNOWN_ENCODING) { }
@@ -161,7 +160,6 @@ Status BinaryDictPageDecoder::init() {
_encoding_type = static_cast<EncodingTypePB>(type);
_data.remove_prefix(BINARY_DICT_PAGE_HEADER_SIZE);
if (_encoding_type == DICT_ENCODING) {
- DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr";
_data_page_decoder.reset(new BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
} else if (_encoding_type == PLAIN_ENCODING) {
DCHECK_EQ(_encoding_type, PLAIN_ENCODING);
@@ -180,12 +178,21 @@ Status BinaryDictPageDecoder::seek_to_position_in_page(size_t pos) {
return _data_page_decoder->seek_to_position_in_page(pos);
}
+// Reports whether this page is dictionary-encoded. The encoding type starts as
+// UNKNOWN_ENCODING and is filled in by init(), so this is false before init().
+bool BinaryDictPageDecoder::is_dict_encoding() const {
+    const bool uses_dict = (DICT_ENCODING == _encoding_type);
+    return uses_dict;
+}
+
+// Attaches the decoder of the column's dictionary page. Only the raw pointer is
+// stored, so the caller must keep `dict_decoder` alive while this page decoder
+// is in use. `dict_decoder` must actually be a BinaryPlainPageDecoder.
+void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder) {
+    // static_cast (not a C-style cast) so that a type unrelated to PageDecoder
+    // is rejected at compile time instead of being silently reinterpreted.
+    _dict_decoder = static_cast<BinaryPlainPageDecoder*>(dict_decoder);
+}
+
Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) {
if (_encoding_type == PLAIN_ENCODING) {
return _data_page_decoder->next_batch(n, dst);
}
// dictionary encoding
DCHECK(_parsed);
+ DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr";
if (PREDICT_FALSE(*n == 0)) {
*n = 0;
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index e434e99..7951efd 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -116,11 +116,15 @@ public:
return _data_page_decoder->current_index();
}
+ bool is_dict_encoding() const;
+
+ void set_dict_decoder(PageDecoder* dict_decoder);
+
private:
Slice _data;
PageDecoderOptions _options;
std::unique_ptr<PageDecoder> _data_page_decoder;
- BinaryPlainPageDecoder* _dict_decoder;
+ const BinaryPlainPageDecoder* _dict_decoder = nullptr;
bool _parsed;
EncodingTypePB _encoding_type;
faststring _code_buf;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index c08b39a..9c7c69d 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -32,6 +32,7 @@
#include "util/crc32c.h"
#include "util/rle_encoding.h" // for RleDecoder
#include "util/block_compression.h"
+#include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder
namespace doris {
namespace segment_v2 {
@@ -168,6 +169,10 @@ void ColumnReader::get_row_ranges_by_zone_map(CondColumn* cond_column,
_calculate_row_ranges(page_indexes, row_ranges);
}
+// Returns the location of this column's dictionary page as recorded in the
+// column meta (the dict_page field).
+PagePointer ColumnReader::get_dict_page_pointer() const {
+    PagePointer dict_pp = _meta.dict_page();
+    return dict_pp;
+}
+
void ColumnReader::_get_filtered_pages(CondColumn* cond_column,
const std::vector<CondColumn*>& delete_conditions, std::vector<uint32_t>* page_indexes) {
FieldType type = _type_info->type();
@@ -426,6 +431,24 @@ Status FileColumnIterator::_read_page(const OrdinalPageIndexIterator& iter, Pars
RETURN_IF_ERROR(_reader->encoding_info()->create_page_decoder(data, options, &page->data_decoder));
RETURN_IF_ERROR(page->data_decoder->init());
+ // Lazily load the dictionary of a dict-encoded column, for three reasons:
+ // 1. A column that uses dictionary encoding may still contain non-dict-encoded data pages, so the dict is loaded only when a dict-encoded page is read.
+ // 2. ColumnReader, which is owned by Segment and Rowset, can stay alive even when there is no query, so it should retain as little memory as possible.
+ // 3. Iterators of the same column won't load the dict page repeatedly because of the page cache.
+ if (_reader->encoding_info()->encoding() == DICT_ENCODING) {
+ BinaryDictPageDecoder* binary_dict_page_decoder = (BinaryDictPageDecoder*)page->data_decoder;
+ if (binary_dict_page_decoder->is_dict_encoding()) {
+ if (_dict_decoder == nullptr) {
+ PagePointer pp = _reader->get_dict_page_pointer();
+ RETURN_IF_ERROR(_reader->read_page(pp, &_dict_page_handle));
+
+ _dict_decoder.reset(new BinaryPlainPageDecoder(_dict_page_handle.data()));
+ RETURN_IF_ERROR(_dict_decoder->init());
+ }
+ binary_dict_page_decoder->set_dict_decoder(_dict_decoder.get());
+ }
+ }
+
page->offset_in_page = 0;
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 6d425cb..af34db2 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -29,6 +29,7 @@
#include "olap/rowset/segment_v2/ordinal_page_index.h" // for OrdinalPageIndexIterator
#include "olap/rowset/segment_v2/column_zone_map.h" // for ColumnZoneMap
#include "olap/rowset/segment_v2/row_ranges.h" // for RowRanges
+#include "olap/rowset/segment_v2/page_handle.h" // for PageHandle
namespace doris {
@@ -85,6 +86,8 @@ public:
void get_row_ranges_by_zone_map(CondColumn* cond_column,
const std::vector<CondColumn*>& delete_conditions, RowRanges* row_ranges);
+ PagePointer get_dict_page_pointer() const;
+
private:
Status _init_ordinal_index();
@@ -189,6 +192,12 @@ private:
// 3. When _page is null, it means that this reader can not be read.
std::unique_ptr<ParsedPage> _page;
+ // keep dict page decoder
+ std::unique_ptr<PageDecoder> _dict_decoder;
+
+ // keep dict page handle to avoid released
+ PageHandle _dict_page_handle;
+
// page iterator used to get next page when current page is finished.
// This value will be reset when a new seek is issued
OrdinalPageIndexIterator _page_iter;
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp
index fe8dade..4faf372 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -213,6 +213,14 @@ Status ColumnWriter::write_data() {
RETURN_IF_ERROR(_write_data_page(page));
page = page->next;
}
+ // write column dict
+ if (_encoding_info->encoding() == DICT_ENCODING) {
+ Slice dict_page;
+ _page_builder->get_dictionary_page(&dict_page);
+ std::vector<Slice> origin_data;
+ origin_data.push_back(dict_page);
+ RETURN_IF_ERROR(_write_physical_page(&origin_data, &_dict_page_pp));
+ }
return Status::OK();
}
@@ -240,6 +248,9 @@ void ColumnWriter::write_meta(ColumnMetaPB* meta) {
if (_opts.need_zone_map) {
_zone_map_pp.to_proto(meta->mutable_zone_map_page());
}
+ if (_encoding_info->encoding() == DICT_ENCODING) {
+ _dict_page_pp.to_proto(meta->mutable_dict_page());
+ }
}
// write a page into file and update ordinal index
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h
index 76cd5b4..b881c13 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -157,6 +157,7 @@ private:
PagePointer _ordinal_index_pp;
PagePointer _zone_map_pp;
+ PagePointer _dict_page_pp;
uint64_t _written_size = 0;
};
diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.cpp b/be/src/olap/rowset/segment_v2/column_zone_map.cpp
index 909a9cf..61dd3f2 100644
--- a/be/src/olap/rowset/segment_v2/column_zone_map.cpp
+++ b/be/src/olap/rowset/segment_v2/column_zone_map.cpp
@@ -28,9 +28,9 @@ ColumnZoneMapBuilder::ColumnZoneMapBuilder(const TypeInfo* type_info) : _type_in
options.data_page_size = 0;
_page_builder.reset(new BinaryPlainPageBuilder(options));
_field.reset(FieldFactory::create_by_type(_type_info->type()));
- _max_string_value = _arena.Allocate(OLAP_STRING_MAX_LENGTH);
- _zone_map.min_value = _arena.Allocate(_type_info->size());
- _zone_map.max_value = _arena.Allocate(_type_info->size());
+ _zone_map.min_value = _field->allocate_value_from_arena(&_arena);
+ _zone_map.max_value = _field->allocate_value_from_arena(&_arena);
+
_reset_zone_map();
}
@@ -38,10 +38,10 @@ Status ColumnZoneMapBuilder::add(const uint8_t *vals, size_t count) {
if (vals != nullptr) {
for (int i = 0; i < count; ++i) {
if (_field->compare(_zone_map.min_value, (char *)vals) > 0) {
- _field->deep_copy_content(_zone_map.min_value, (const char *)vals, &_arena);
+ _field->direct_copy_content(_zone_map.min_value, (const char *)vals);
}
if (_field->compare(_zone_map.max_value, (char *)vals) < 0) {
- _field->deep_copy_content(_zone_map.max_value, (const char *)vals, &_arena);
+ _field->direct_copy_content(_zone_map.max_value, (const char *)vals);
}
vals += _type_info->size();
if (!_zone_map.has_not_null) {
@@ -78,10 +78,6 @@ Status ColumnZoneMapBuilder::flush() {
}
void ColumnZoneMapBuilder::_reset_zone_map() {
- // we should allocate max varchar length and set to max for min value
- Slice *min_slice = (Slice *)_zone_map.min_value;
- min_slice->data = _max_string_value;
- min_slice->size = OLAP_STRING_MAX_LENGTH;
_field->set_to_max(_zone_map.min_value);
_field->set_to_min(_zone_map.max_value);
_zone_map.has_null = false;
diff --git a/be/src/olap/rowset/segment_v2/column_zone_map.h b/be/src/olap/rowset/segment_v2/column_zone_map.h
index 1fb01bc..3e2b7ac 100644
--- a/be/src/olap/rowset/segment_v2/column_zone_map.h
+++ b/be/src/olap/rowset/segment_v2/column_zone_map.h
@@ -73,7 +73,6 @@ private:
std::unique_ptr<Field> _field;
// memory will be managed by arena
ZoneMap _zone_map;
- char* _max_string_value;
Arena _arena;
};
diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp b/be/src/olap/rowset/segment_v2/encoding_info.cpp
index 3cfb715..31539a2 100644
--- a/be/src/olap/rowset/segment_v2/encoding_info.cpp
+++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp
@@ -20,6 +20,7 @@
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/bitshuffle_page.h"
#include "olap/rowset/segment_v2/rle_page.h"
+#include "olap/rowset/segment_v2/binary_dict_page.h"
namespace doris {
namespace segment_v2 {
@@ -67,6 +68,18 @@ struct TypeEncodingTraits<type, RLE> {
}
};
+// Encoding traits for DICT_ENCODING, for any field type: pages are produced by
+// BinaryDictPageBuilder and consumed by BinaryDictPageDecoder.
+template<FieldType type>
+struct TypeEncodingTraits<type, DICT_ENCODING> {
+    // Allocates a dict page builder with `new`, stores it in `*builder`, and
+    // returns OK.
+    static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) {
+        PageBuilder* dict_builder = new BinaryDictPageBuilder(opts);
+        *builder = dict_builder;
+        return Status::OK();
+    }
+
+    // Allocates a dict page decoder over `data` with `new`, stores it in
+    // `*decoder`, and returns OK. The decoder is not initialized here.
+    static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts, PageDecoder** decoder) {
+        PageDecoder* dict_decoder = new BinaryDictPageDecoder(data, opts);
+        *decoder = dict_decoder;
+        return Status::OK();
+    }
+};
+
template<FieldType Type, EncodingTypePB Encoding>
struct EncodingTraits : TypeEncodingTraits<Type, Encoding> {
static const FieldType type = Type;
@@ -122,6 +135,10 @@ EncodingInfoResolver::EncodingInfoResolver() {
_add_map<OLAP_FIELD_TYPE_FLOAT, PLAIN_ENCODING>();
_add_map<OLAP_FIELD_TYPE_DOUBLE, BIT_SHUFFLE>();
_add_map<OLAP_FIELD_TYPE_DOUBLE, PLAIN_ENCODING>();
+ _add_map<OLAP_FIELD_TYPE_CHAR, DICT_ENCODING>();
+ _add_map<OLAP_FIELD_TYPE_CHAR, PLAIN_ENCODING>();
+ _add_map<OLAP_FIELD_TYPE_VARCHAR, DICT_ENCODING>();
+ _add_map<OLAP_FIELD_TYPE_VARCHAR, PLAIN_ENCODING>();
_add_map<OLAP_FIELD_TYPE_BOOL, RLE>();
_add_map<OLAP_FIELD_TYPE_BOOL, BIT_SHUFFLE>();
_add_map<OLAP_FIELD_TYPE_BOOL, PLAIN_ENCODING>();
diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h
index 386afad..3997e0d 100644
--- a/be/src/olap/rowset/segment_v2/options.h
+++ b/be/src/olap/rowset/segment_v2/options.h
@@ -31,7 +31,6 @@ struct PageBuilderOptions {
};
struct PageDecoderOptions {
- BinaryPlainPageDecoder* dict_decoder = nullptr;
};
} // namespace segment_v2
diff --git a/be/src/olap/types.cpp b/be/src/olap/types.cpp
index 87ef27b..36704fc 100644
--- a/be/src/olap/types.cpp
+++ b/be/src/olap/types.cpp
@@ -27,6 +27,7 @@ TypeInfo::TypeInfo(TypeTraitsClass t)
_deep_copy(TypeTraitsClass::deep_copy),
_deep_copy_with_arena(TypeTraitsClass::deep_copy_with_arena),
_direct_copy(TypeTraitsClass::direct_copy),
+ _allocate_value_from_arena(TypeTraitsClass::allocate_value_from_arena),
_from_string(TypeTraitsClass::from_string),
_to_string(TypeTraitsClass::to_string),
_set_to_max(TypeTraitsClass::set_to_max),
diff --git a/be/src/olap/types.h b/be/src/olap/types.h
index 2b63035..bd38110 100644
--- a/be/src/olap/types.h
+++ b/be/src/olap/types.h
@@ -64,6 +64,10 @@ public:
_direct_copy(dest, src);
}
+    // Allocates storage for one value of this type from `arena` through the
+    // per-type function pointer captured at construction.
+    inline char* allocate_value_from_arena(Arena* arena) const {
+        char* value_buf = _allocate_value_from_arena(arena);
+        return value_buf;
+    }
+
OLAPStatus from_string(void* buf, const std::string& scan_key) const {
return _from_string(buf, scan_key);
}
@@ -85,6 +89,7 @@ private:
void (*_deep_copy)(void* dest, const void* src, MemPool* mem_pool);
void (*_deep_copy_with_arena)(void* dest, const void* src, Arena* arena);
void (*_direct_copy)(void* dest, const void* src);
+ char* (*_allocate_value_from_arena)(Arena* arena);
OLAPStatus (*_from_string)(void* buf, const std::string& scan_key);
std::string (*_to_string)(const void* src);
@@ -213,6 +218,10 @@ struct BaseFieldtypeTraits : public CppTypeTraits<field_type> {
return HashUtil::hash(data, sizeof(CppType), seed);
}
+    // Default allocation: a buffer of sizeof(CppType) bytes taken from `arena`.
+    static inline char* allocate_value_from_arena(Arena* arena) {
+        char* value_buf = arena->Allocate(sizeof(CppType));
+        return value_buf;
+    }
+
static std::string to_string(const void* src) {
std::stringstream stream;
stream << *reinterpret_cast<const CppType*>(src);
@@ -568,6 +577,13 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_CHAR> : public BaseFieldtypeTraits<OLAP_F
auto slice = reinterpret_cast<const Slice*>(data);
return HashUtil::hash(slice->data, slice->size, seed);
}
+    // Allocates a Slice header from `arena` plus a separate buffer of
+    // OLAP_CHAR_MAX_LENGTH bytes for it to point at, and returns the header.
+    static char* allocate_value_from_arena(Arena* arena) {
+        auto* value_slice = reinterpret_cast<Slice*>(arena->Allocate(sizeof(Slice)));
+        value_slice->size = OLAP_CHAR_MAX_LENGTH;
+        value_slice->data = arena->Allocate(OLAP_CHAR_MAX_LENGTH);
+        return reinterpret_cast<char*>(value_slice);
+    }
};
template<>
@@ -594,6 +610,13 @@ struct FieldTypeTraits<OLAP_FIELD_TYPE_VARCHAR> : public FieldTypeTraits<OLAP_FI
auto slice = reinterpret_cast<Slice*>(buf);
slice->size = 0;
}
+    // Allocates a Slice header from `arena` plus a separate buffer of
+    // OLAP_STRING_MAX_LENGTH bytes for it to point at, and returns the header.
+    static char* allocate_value_from_arena(Arena* arena) {
+        auto* value_slice = reinterpret_cast<Slice*>(arena->Allocate(sizeof(Slice)));
+        value_slice->size = OLAP_STRING_MAX_LENGTH;
+        value_slice->data = arena->Allocate(OLAP_STRING_MAX_LENGTH);
+        return reinterpret_cast<char*>(value_slice);
+    }
};
template<>
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index 9a25b7a..d082bca 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -62,8 +62,9 @@ public:
// decode
PageDecoderOptions decoder_options;
- decoder_options.dict_decoder = dict_page_decoder.get();
BinaryDictPageDecoder page_decoder(s, decoder_options);
+ page_decoder.set_dict_decoder(dict_page_decoder.get());
+
status = page_decoder.init();
ASSERT_TRUE(status.ok());
ASSERT_EQ(slices.size(), page_decoder.count());
@@ -154,9 +155,9 @@ public:
// decode
PageDecoderOptions decoder_options;
- decoder_options.dict_decoder = dict_page_decoder.get();
BinaryDictPageDecoder page_decoder(results[slice_index], decoder_options);
status = page_decoder.init();
+ page_decoder.set_dict_decoder(dict_page_decoder.get());
ASSERT_TRUE(status.ok());
//check values
diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
index 0ae68fb..45f0242 100644
--- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
@@ -56,7 +56,7 @@ public:
PageDecoderType page_decoder(s, decoder_options);
Status status = page_decoder.init();
ASSERT_TRUE(status.ok());
-
+
//test1
size_t size = 3;
diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp
index 80ce4ff..01121ce 100644
--- a/be/test/olap/rowset/segment_v2/segment_test.cpp
+++ b/be/test/olap/rowset/segment_v2/segment_test.cpp
@@ -606,6 +606,247 @@ TEST_F(SegmentReaderWriterTest, TestDefaultValueColumn) {
}
}
+// Test helper: stores the integer `src` into the cell at `target` according to
+// `fieldType`.
+//   - CHAR:    decimal text of `src`, zero-padded to `_length` bytes.
+//   - VARCHAR: decimal text of `src`, exactly as long as the text.
+//   - other:   `src` written as a raw int.
+// For the string types `target` is treated as a Slice whose bytes are allocated
+// from `_arena`.
+void set_column_value_by_type(FieldType fieldType, int src, char* target, Arena* _arena, size_t _length = 0) {
+    // Keep the text in a named string. Taking the address of the
+    // std::to_string() temporary left a dangling pointer, because the temporary
+    // is destroyed at the end of the declaration statement.
+    const std::string src_text = std::to_string(src);
+    if (fieldType == OLAP_FIELD_TYPE_CHAR) {
+        auto* dest_slice = reinterpret_cast<Slice*>(target);
+        dest_slice->size = _length;
+        dest_slice->data = _arena->Allocate(dest_slice->size);
+        // Clamp to the fixed CHAR width so an over-long value can neither
+        // overrun the buffer nor underflow the padding length below.
+        const size_t copy_len = src_text.size() < dest_slice->size ? src_text.size() : dest_slice->size;
+        memcpy(dest_slice->data, src_text.data(), copy_len);
+        memset(dest_slice->data + copy_len, 0, dest_slice->size - copy_len);
+    } else if (fieldType == OLAP_FIELD_TYPE_VARCHAR) {
+        auto* dest_slice = reinterpret_cast<Slice*>(target);
+        dest_slice->size = src_text.size();
+        dest_slice->data = _arena->Allocate(dest_slice->size);
+        memcpy(dest_slice->data, src_text.data(), dest_slice->size);
+    } else {
+        *reinterpret_cast<int*>(target) = src;
+    }
+}
+
+// Writes 4096 rows of CHAR/VARCHAR columns through SegmentWriter, then reopens
+// the segment and checks: a full scan, two key-range seeks that must return no
+// rows, and two zone-map-filtered reads on the first CHAR column.
+TEST_F(SegmentReaderWriterTest, TestStringDict) {
+ size_t num_rows_per_block = 10;
+ Arena _arena;
+
+ // Schema: two CHAR columns (ids 1, 2) followed by two VARCHAR columns (ids 3, 4).
+ std::shared_ptr<TabletSchema> tablet_schema(new TabletSchema());
+ tablet_schema->_num_columns = 4;
+ tablet_schema->_num_key_columns = 3;
+ tablet_schema->_num_short_key_columns = 2;
+ tablet_schema->_num_rows_per_row_block = num_rows_per_block;
+ tablet_schema->_cols.push_back(create_char_key(1));
+ tablet_schema->_cols.push_back(create_char_key(2));
+ tablet_schema->_cols.push_back(create_varchar_key(3));
+ tablet_schema->_cols.push_back(create_varchar_key(4));
+
+ // segment write
+ std::string dname = "./ut_dir/segment_test";
+ FileUtils::create_dir(dname);
+
+ SegmentWriterOptions opts;
+ opts.num_rows_per_block = num_rows_per_block;
+
+ std::string fname = dname + "/string_case";
+
+ SegmentWriter writer(fname, 0, tablet_schema.get(), opts);
+ auto st = writer.init(10);
+ ASSERT_TRUE(st.ok());
+
+ RowCursor row;
+ auto olap_st = row.init(*tablet_schema);
+ ASSERT_EQ(OLAP_SUCCESS, olap_st);
+
+ // Row i, column j holds the integer i * 10 + j converted to a string:
+ // 0, 1, 2, 3
+ // 10, 11, 12, 13
+ // 20, 21, 22, 23
+ for (int i = 0; i < 4096; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ auto cell = row.cell(j);
+ cell.set_not_null();
+ set_column_value_by_type(tablet_schema->_cols[j]._type, i * 10 + j, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[j]._length);
+ }
+ Status status = writer.append_row(row);
+ ASSERT_TRUE(status.ok());
+ }
+
+ uint64_t file_size = 0;
+ st = writer.finalize(&file_size);
+ ASSERT_TRUE(st.ok());
+
+ {
+ std::shared_ptr<Segment> segment(new Segment(fname, 0, tablet_schema.get()));
+ st = segment->open();
+ ASSERT_TRUE(st.ok());
+ ASSERT_EQ(4096, segment->num_rows());
+ Schema schema(*tablet_schema);
+
+ // scan all rows, 1024 at a time, and compare every cell with the value
+ // regenerated from its row id and column id
+ {
+ StorageReadOptions read_opts;
+ std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+ RowBlockV2 block(schema, 1024);
+
+ int left = 4096;
+ int rowid = 0;
+
+ while (left > 0) {
+ int rows_read = left > 1024 ? 1024 : left;
+ block.clear();
+ st = iter->next_batch(&block);
+ ASSERT_TRUE(st.ok());
+ ASSERT_EQ(rows_read, block.num_rows());
+ left -= rows_read;
+
+ for (int j = 0; j < block.schema()->column_ids().size(); ++j) {
+ auto cid = block.schema()->column_ids()[j];
+ auto column_block = block.column_block(j);
+ for (int i = 0; i < rows_read; ++i) {
+ int rid = rowid + i;
+ ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i));
+ const Slice* actual = reinterpret_cast<const Slice*>(column_block.cell_ptr(i));
+
+ Slice expect;
+ set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast<char*>(&expect), &_arena, tablet_schema->_cols[j]._length);
+ ASSERT_EQ(expect.to_string(), actual->to_string());
+ }
+ }
+ rowid += rows_read;
+ }
+ }
+
+ // test seek with only a lower bound ("40970") on the first key column;
+ // the read is expected to hit end-of-file with no rows
+ {
+ // lower bound
+ std::unique_ptr<RowCursor> lower_bound(new RowCursor());
+ lower_bound->init(*tablet_schema, 1);
+ {
+ auto cell = lower_bound->cell(0);
+ cell.set_not_null();
+ set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, 40970, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length);
+ }
+
+ StorageReadOptions read_opts;
+ read_opts.key_ranges.emplace_back(lower_bound.get(), false, nullptr, false);
+ std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+ RowBlockV2 block(schema, 100);
+ st = iter->next_batch(&block);
+ ASSERT_TRUE(st.is_end_of_file());
+ ASSERT_EQ(0, block.num_rows());
+ }
+
+ // test seek, key (-2, -1); the read is expected to hit end-of-file with no rows
+ {
+ // lower bound
+ std::unique_ptr<RowCursor> lower_bound(new RowCursor());
+ lower_bound->init(*tablet_schema, 1);
+ {
+ auto cell = lower_bound->cell(0);
+ cell.set_not_null();
+ set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -2, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length);
+ }
+
+ // upper bound
+ std::unique_ptr<RowCursor> upper_bound(new RowCursor());
+ upper_bound->init(*tablet_schema, 1);
+ {
+ auto cell = upper_bound->cell(0);
+ cell.set_not_null();
+ set_column_value_by_type(OLAP_FIELD_TYPE_CHAR, -1, (char*)cell.mutable_cell_ptr(), &_arena, tablet_schema->_cols[0]._length);
+ }
+
+ StorageReadOptions read_opts;
+ read_opts.key_ranges.emplace_back(lower_bound.get(), false, upper_bound.get(), false);
+ std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+ RowBlockV2 block(schema, 100);
+ st = iter->next_batch(&block);
+ ASSERT_TRUE(st.is_end_of_file());
+ ASSERT_EQ(0, block.num_rows());
+ }
+
+ // test char zone_map query hit (column "1" > "100"); should read whole page,
+ // so all 4096 rows are expected back
+ {
+ TCondition condition;
+ condition.__set_column_name("1");
+ condition.__set_condition_op(">");
+ std::vector<std::string> vals = {"100"};
+ condition.__set_condition_values(vals);
+ std::shared_ptr<Conditions> conditions(new Conditions());
+ conditions->set_tablet_schema(tablet_schema.get());
+ conditions->append_condition(condition);
+
+ StorageReadOptions read_opts;
+ read_opts.conditions = conditions.get();
+
+ std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+ RowBlockV2 block(schema, 1024);
+ int left = 4 * 1024;
+ int rowid = 0;
+
+ while (left > 0) {
+ int rows_read = left > 1024 ? 1024 : left;
+ block.clear();
+ st = iter->next_batch(&block);
+ ASSERT_TRUE(st.ok());
+ ASSERT_EQ(rows_read, block.num_rows());
+ left -= rows_read;
+
+ for (int j = 0; j < block.schema()->column_ids().size(); ++j) {
+ auto cid = block.schema()->column_ids()[j];
+ auto column_block = block.column_block(j);
+ for (int i = 0; i < rows_read; ++i) {
+ int rid = rowid + i;
+ ASSERT_FALSE(BitmapTest(column_block.null_bitmap(), i));
+
+ const Slice* actual = reinterpret_cast<const Slice*>(column_block.cell_ptr(i));
+ Slice expect;
+ set_column_value_by_type(tablet_schema->_cols[j]._type, rid * 10 + cid, reinterpret_cast<char*>(&expect), &_arena, tablet_schema->_cols[j]._length);
+ ASSERT_EQ(expect.to_string(), actual->to_string()) << "rid:" << rid << ", i:" << i;;
+ }
+ }
+ rowid += rows_read;
+ }
+ ASSERT_EQ(4 * 1024, rowid);
+ // one more batch after the last row must report end-of-file
+ st = iter->next_batch(&block);
+ ASSERT_TRUE(st.is_end_of_file());
+ ASSERT_EQ(0, block.num_rows());
+ }
+
+ // test char zone_map query miss (column "1" < "-2"); no rows are expected
+ {
+ TCondition condition;
+ condition.__set_column_name("1");
+ condition.__set_condition_op("<");
+ std::vector<std::string> vals = {"-2"};
+ condition.__set_condition_values(vals);
+ std::shared_ptr<Conditions> conditions(new Conditions());
+ conditions->set_tablet_schema(tablet_schema.get());
+ conditions->append_condition(condition);
+
+ StorageReadOptions read_opts;
+ read_opts.conditions = conditions.get();
+
+ std::unique_ptr<SegmentIterator> iter = segment->new_iterator(schema, read_opts);
+
+ RowBlockV2 block(schema, 1024);
+
+ st = iter->next_batch(&block);
+ ASSERT_TRUE(st.is_end_of_file());
+ ASSERT_EQ(0, block.num_rows());
+ }
+
+ }
+
+ // clean up the directory created for this test
+ FileUtils::remove_all(dname);
+}
+
}
}
diff --git a/be/test/olap/tablet_schema_helper.h b/be/test/olap/tablet_schema_helper.h
index 8e22b33..73ecdf3 100644
--- a/be/test/olap/tablet_schema_helper.h
+++ b/be/test/olap/tablet_schema_helper.h
@@ -55,4 +55,28 @@ TabletColumn create_int_value(
return column;
}
+// Test helper: builds a CHAR key column whose name is the decimal text of `id`.
+// The column has length 8 and index length 1; nullability defaults to true.
+TabletColumn create_char_key(int32_t id, bool is_nullable = true) {
+    TabletColumn char_col;
+    // identity
+    char_col._unique_id = id;
+    char_col._col_name = std::to_string(id);
+    // type and key flags
+    char_col._type = OLAP_FIELD_TYPE_CHAR;
+    char_col._is_key = true;
+    char_col._is_nullable = is_nullable;
+    // sizes
+    char_col._length = 8;
+    char_col._index_length = 1;
+    return char_col;
+}
+
+// Test helper: builds a VARCHAR key column whose name is the decimal text of
+// `id`. The column has length 4 and index length 4; nullability defaults to true.
+TabletColumn create_varchar_key(int32_t id, bool is_nullable = true) {
+    TabletColumn varchar_col;
+    // identity
+    varchar_col._unique_id = id;
+    varchar_col._col_name = std::to_string(id);
+    // type and key flags
+    varchar_col._type = OLAP_FIELD_TYPE_VARCHAR;
+    varchar_col._is_key = true;
+    varchar_col._is_nullable = is_nullable;
+    // sizes
+    varchar_col._length = 4;
+    varchar_col._index_length = 4;
+    return varchar_col;
+}
+
}
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index 4a3f861..8abe0ae 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -95,7 +95,7 @@ message ColumnMetaPB {
optional PagePointerPB zone_map_page = 8;
// // dictionary page for DICT_ENCODING
- // optional PagePointerPB dict_page = 2;
+ optional PagePointerPB dict_page = 9;
// // bloom filter pages for bloom filter column
// repeated PagePointerPB bloom_filter_pages = 3;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org