You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/06/30 03:27:28 UTC

[doris] branch master updated: [Fix] avoid core dump cause by malformed bitmap type data (#10458)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d259770b86 [Fix] avoid core dump cause by malformed bitmap type data (#10458)
d259770b86 is described below

commit d259770b86955a4cb34102d4cc95ae1a731326af
Author: Zhengguo Yang <ya...@gmail.com>
AuthorDate: Thu Jun 30 11:27:22 2022 +0800

    [Fix] avoid core dump cause by malformed bitmap type data (#10458)
---
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp   |  8 ++++----
 be/src/olap/rowset/segment_v2/binary_dict_page.h     |  4 ++--
 be/src/olap/rowset/segment_v2/binary_plain_page.h    | 20 +++++++++++++++++++-
 be/src/olap/rowset/segment_v2/column_reader.cpp      |  6 ++++--
 be/src/olap/rowset/segment_v2/encoding_info.cpp      |  4 ++--
 .../olap/rowset/segment_v2/indexed_column_reader.cpp |  4 +++-
 .../olap/rowset/segment_v2/indexed_column_writer.cpp |  6 ++++--
 be/src/olap/rowset/segment_v2/options.h              |  8 +++++---
 be/src/olap/rowset/segment_v2/parsed_page.h          |  4 ++--
 be/src/tools/meta_tool.cpp                           |  1 -
 be/src/util/bitmap_value.h                           | 14 +++++++++++++-
 .../olap/rowset/segment_v2/binary_dict_page_test.cpp | 10 ++++++----
 .../rowset/segment_v2/binary_plain_page_test.cpp     |  3 ++-
 13 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index b53ff52ade..ea9fe66dfc 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -45,7 +45,7 @@ BinaryDictPageBuilder::BinaryDictPageBuilder(const PageBuilderOptions& options)
     _data_page_builder.reset(new BitshufflePageBuilder<OLAP_FIELD_TYPE_INT>(options));
     PageBuilderOptions dict_builder_options;
     dict_builder_options.data_page_size = _options.dict_page_size;
-    _dict_builder.reset(new BinaryPlainPageBuilder(dict_builder_options));
+    _dict_builder.reset(new BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>(dict_builder_options));
     reset();
 }
 
@@ -135,7 +135,7 @@ void BinaryDictPageBuilder::reset() {
     _buffer.resize(BINARY_DICT_PAGE_HEADER_SIZE);
 
     if (_encoding_type == DICT_ENCODING && _dict_builder->is_page_full()) {
-        _data_page_builder.reset(new BinaryPlainPageBuilder(_options));
+        _data_page_builder.reset(new BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>(_options));
         _encoding_type = PLAIN_ENCODING;
     } else {
         _data_page_builder->reset();
@@ -206,7 +206,7 @@ Status BinaryDictPageDecoder::init() {
                 _bit_shuffle_ptr = new BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
     } else if (_encoding_type == PLAIN_ENCODING) {
         DCHECK_EQ(_encoding_type, PLAIN_ENCODING);
-        _data_page_decoder.reset(new BinaryPlainPageDecoder(_data, _options));
+        _data_page_decoder.reset(new BinaryPlainPageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
     } else {
         LOG(WARNING) << "invalid encoding type:" << _encoding_type;
         return Status::Corruption(strings::Substitute("invalid encoding type:$0", _encoding_type));
@@ -228,7 +228,7 @@ bool BinaryDictPageDecoder::is_dict_encoding() const {
 }
 
 void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, StringRef* dict_word_info) {
-    _dict_decoder = (BinaryPlainPageDecoder*)dict_decoder;
+    _dict_decoder = (BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>*)dict_decoder;
     _dict_word_info = dict_word_info;
 };
 
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index e1dcdba49b..d57f7cf974 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -78,7 +78,7 @@ private:
 
     std::unique_ptr<PageBuilder> _data_page_builder;
 
-    std::unique_ptr<BinaryPlainPageBuilder> _dict_builder;
+    std::unique_ptr<BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>> _dict_builder;
 
     EncodingTypePB _encoding_type;
     struct HashOfSlice {
@@ -122,7 +122,7 @@ private:
     Slice _data;
     PageDecoderOptions _options;
     std::unique_ptr<PageDecoder> _data_page_decoder;
-    BinaryPlainPageDecoder* _dict_decoder = nullptr;
+    BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>* _dict_decoder = nullptr;
     BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>* _bit_shuffle_ptr = nullptr;
     bool _parsed;
     EncodingTypePB _encoding_type;
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index cac9da3709..1919efcba8 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -44,6 +44,7 @@
 namespace doris {
 namespace segment_v2 {
 
+template <FieldType Type>
 class BinaryPlainPageBuilder : public PageBuilder {
 public:
     BinaryPlainPageBuilder(const PageBuilderOptions& options)
@@ -64,6 +65,11 @@ public:
         // If the page is full, should stop adding more items.
         while (!is_page_full() && i < *count) {
             auto src = reinterpret_cast<const Slice*>(vals);
+            if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+                if (_options.need_check_bitmap) {
+                    RETURN_IF_ERROR(BitmapTypeCode::validate(*(src->data)));
+                }
+            }
             size_t offset = _buffer.size();
             _offsets.push_back(offset);
             _buffer.append(src->data, src->size);
@@ -154,6 +160,7 @@ private:
     faststring _last_value;
 };
 
+template <FieldType Type>
 class BinaryPlainPageDecoder : public PageDecoder {
 public:
     BinaryPlainPageDecoder(Slice data) : BinaryPlainPageDecoder(data, PageDecoderOptions()) {}
@@ -204,6 +211,11 @@ public:
         size_t mem_len[max_fetch];
         for (size_t i = 0; i < max_fetch; i++, out++, _cur_idx++) {
             *out = string_at_index(_cur_idx);
+            if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+                if (_options.need_check_bitmap) {
+                    RETURN_IF_ERROR(BitmapTypeCode::validate(*(out->data)));
+                }
+            }
             mem_len[i] = out->size;
         }
 
@@ -246,6 +258,11 @@ public:
             uint32_t len = offset(_cur_idx + 1) - start_offset;
             len_array[i] = len;
             start_offset_array[i] = start_offset;
+            if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+                if (_options.need_check_bitmap) {
+                    RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data + start_offset)));
+                }
+            }
         }
         dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array,
                                      max_fetch);
@@ -271,8 +288,9 @@ public:
     }
 
     void get_dict_word_info(StringRef* dict_word_info) {
-        if (_num_elems <= 0) [[unlikely]]
+        if (UNLIKELY(_num_elems <= 0)) {
             return;
+        }
 
         char* data_begin = (char*)&_data[0];
         char* offset_ptr = (char*)&_data[_offsets_pos];
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7243736281..a7d8c99899 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -747,10 +747,12 @@ Status FileColumnIterator::_read_data_page(const OrdinalPageIndexIterator& iter)
                                                    _compress_codec.get()));
                 // ignore dict_footer.dict_page_footer().encoding() due to only
                 // PLAIN_ENCODING is supported for dict page right now
-                _dict_decoder = std::make_unique<BinaryPlainPageDecoder>(dict_data);
+                _dict_decoder = std::make_unique<BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>>(
+                        dict_data);
                 RETURN_IF_ERROR(_dict_decoder->init());
 
-                auto* pd_decoder = (BinaryPlainPageDecoder*)_dict_decoder.get();
+                auto* pd_decoder =
+                        (BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>*)_dict_decoder.get();
                 _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
                 pd_decoder->get_dict_word_info(_dict_word_info.get());
             }
diff --git a/be/src/olap/rowset/segment_v2/encoding_info.cpp b/be/src/olap/rowset/segment_v2/encoding_info.cpp
index ac3a1a624f..4f9e2d5d14 100644
--- a/be/src/olap/rowset/segment_v2/encoding_info.cpp
+++ b/be/src/olap/rowset/segment_v2/encoding_info.cpp
@@ -58,12 +58,12 @@ struct TypeEncodingTraits<type, PLAIN_ENCODING, CppType> {
 template <FieldType type>
 struct TypeEncodingTraits<type, PLAIN_ENCODING, Slice> {
     static Status create_page_builder(const PageBuilderOptions& opts, PageBuilder** builder) {
-        *builder = new BinaryPlainPageBuilder(opts);
+        *builder = new BinaryPlainPageBuilder<type>(opts);
         return Status::OK();
     }
     static Status create_page_decoder(const Slice& data, const PageDecoderOptions& opts,
                                       PageDecoder** decoder) {
-        *decoder = new BinaryPlainPageDecoder(data, opts);
+        *decoder = new BinaryPlainPageDecoder<type>(data, opts);
         return Status::OK();
     }
 };
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp
index 6a0b2ca81e..682f620ea8 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp
@@ -110,8 +110,10 @@ Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) {
                                        _compress_codec.get()));
     // parse data page
     // note that page_index is not used in IndexedColumnIterator, so we pass 0
+    PageDecoderOptions opts;
+    opts.need_check_bitmap = false;
     return ParsedPage::create(std::move(handle), body, footer.data_page_footer(),
-                              _reader->encoding_info(), pp, 0, &_data_page);
+                              _reader->encoding_info(), pp, 0, &_data_page, opts);
 }
 
 Status IndexedColumnIterator::seek_to_ordinal(ordinal_t idx) {
diff --git a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
index 4c7259c90c..979481adbc 100644
--- a/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/indexed_column_writer.cpp
@@ -59,8 +59,10 @@ Status IndexedColumnWriter::init() {
     // because the default encoding of a data type can be changed in the future
     DCHECK_NE(_options.encoding, DEFAULT_ENCODING);
 
-    PageBuilder* data_page_builder;
-    RETURN_IF_ERROR(encoding_info->create_page_builder(PageBuilderOptions(), &data_page_builder));
+    PageBuilder* data_page_builder = nullptr;
+    PageBuilderOptions builder_option;
+    builder_option.need_check_bitmap = false;
+    RETURN_IF_ERROR(encoding_info->create_page_builder(builder_option, &data_page_builder));
     _data_page_builder.reset(data_page_builder);
 
     if (_options.write_ordinal_index) {
diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h
index 0cb2dafa39..9405eb19cf 100644
--- a/be/src/olap/rowset/segment_v2/options.h
+++ b/be/src/olap/rowset/segment_v2/options.h
@@ -22,17 +22,19 @@
 namespace doris {
 namespace segment_v2 {
 
-class BinaryPlainPageDecoder;
-
 static constexpr size_t DEFAULT_PAGE_SIZE = 1024 * 1024; // default size: 1M
 
 struct PageBuilderOptions {
     size_t data_page_size = DEFAULT_PAGE_SIZE;
 
     size_t dict_page_size = DEFAULT_PAGE_SIZE;
+
+    bool need_check_bitmap = true;
 };
 
-struct PageDecoderOptions {};
+struct PageDecoderOptions {
+    bool need_check_bitmap = true;
+};
 
 } // namespace segment_v2
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/parsed_page.h b/be/src/olap/rowset/segment_v2/parsed_page.h
index c70ebd62a6..232c347c38 100644
--- a/be/src/olap/rowset/segment_v2/parsed_page.h
+++ b/be/src/olap/rowset/segment_v2/parsed_page.h
@@ -38,7 +38,8 @@ namespace segment_v2 {
 struct ParsedPage {
     static Status create(PageHandle handle, const Slice& body, const DataPageFooterPB& footer,
                          const EncodingInfo* encoding, const PagePointer& page_pointer,
-                         uint32_t page_index, ParsedPage* result) {
+                         uint32_t page_index, ParsedPage* result,
+                         PageDecoderOptions opts = PageDecoderOptions()) {
         result->~ParsedPage();
         ParsedPage* page = new (result)(ParsedPage);
         page->page_handle = std::move(handle);
@@ -53,7 +54,6 @@ struct ParsedPage {
         }
 
         Slice data_slice(body.data, body.size - null_size);
-        PageDecoderOptions opts;
         RETURN_IF_ERROR(encoding->create_page_decoder(data_slice, opts, &page->data_decoder));
         RETURN_IF_ERROR(page->data_decoder->init());
 
diff --git a/be/src/tools/meta_tool.cpp b/be/src/tools/meta_tool.cpp
index 60b56b35cb..f4e5e6087c 100644
--- a/be/src/tools/meta_tool.cpp
+++ b/be/src/tools/meta_tool.cpp
@@ -56,7 +56,6 @@ using doris::RandomAccessFile;
 using strings::Substitute;
 using doris::segment_v2::SegmentFooterPB;
 using doris::segment_v2::ColumnReader;
-using doris::segment_v2::BinaryPlainPageDecoder;
 using doris::segment_v2::PageHandle;
 using doris::segment_v2::PagePointer;
 using doris::segment_v2::ColumnReaderOptions;
diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h
index 4b1445ba59..7eaeb11e79 100644
--- a/be/src/util/bitmap_value.h
+++ b/be/src/util/bitmap_value.h
@@ -71,6 +71,16 @@ struct BitmapTypeCode {
         // added in 0.12
         BITMAP64 = 4
     };
+    Status static inline validate(int bitmap_type) {
+        if (UNLIKELY(bitmap_type < type::EMPTY || bitmap_type > type::BITMAP64)) {
+            std::string err_msg =
+                    fmt::format("BitmapTypeCode invalid, should between: {} and {} actrual is {}",
+                                BitmapTypeCode::EMPTY, BitmapTypeCode::BITMAP64, bitmap_type);
+            LOG(ERROR) << err_msg;
+            return Status::IOError(err_msg);
+        }
+        return Status::OK();
+    }
 };
 
 namespace detail {
@@ -1536,7 +1546,6 @@ public:
     // Deserialize a bitmap value from `src`.
     // Return false if `src` begins with unknown type code, true otherwise.
     bool deserialize(const char* src) {
-        DCHECK(*src >= BitmapTypeCode::EMPTY && *src <= BitmapTypeCode::BITMAP64);
         switch (*src) {
         case BitmapTypeCode::EMPTY:
             _type = EMPTY;
@@ -1555,6 +1564,9 @@ public:
             _bitmap = detail::Roaring64Map::read(src);
             break;
         default:
+            LOG(ERROR) << "BitmapTypeCode invalid, should between: " << BitmapTypeCode::EMPTY
+                       << " and " << BitmapTypeCode::BITMAP64 << " actrual is "
+                       << static_cast<int>(*src);
             return false;
         }
         return true;
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index 8c59da6491..bb3493efd6 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -68,8 +68,9 @@ public:
         Status status = page_builder.get_dictionary_page(&dict_slice);
         EXPECT_TRUE(status.ok());
         PageDecoderOptions dict_decoder_options;
-        std::unique_ptr<BinaryPlainPageDecoder> dict_page_decoder(
-                new BinaryPlainPageDecoder(dict_slice.slice(), dict_decoder_options));
+        std::unique_ptr<BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>> dict_page_decoder(
+                new BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>(dict_slice.slice(),
+                                                                    dict_decoder_options));
         status = dict_page_decoder->init();
         EXPECT_TRUE(status.ok());
         // because every slice is unique
@@ -175,8 +176,9 @@ public:
             int slice_index = random() % results.size();
             //int slice_index = 1;
             PageDecoderOptions dict_decoder_options;
-            std::unique_ptr<BinaryPlainPageDecoder> dict_page_decoder(
-                    new BinaryPlainPageDecoder(dict_slice.slice(), dict_decoder_options));
+            std::unique_ptr<BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>> dict_page_decoder(
+                    new BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>(dict_slice.slice(),
+                                                                        dict_decoder_options));
             status = dict_page_decoder->init();
             EXPECT_TRUE(status.ok());
 
diff --git a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
index 0e0d34606d..1da1db55e1 100644
--- a/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_plain_page_test.cpp
@@ -108,7 +108,8 @@ public:
 };
 
 TEST_F(BinaryPlainPageTest, TestBinaryPlainPageBuilderSeekByValueSmallPage) {
-    TestBinarySeekByValueSmallPage<BinaryPlainPageBuilder, BinaryPlainPageDecoder>();
+    TestBinarySeekByValueSmallPage<BinaryPlainPageBuilder<OLAP_FIELD_TYPE_VARCHAR>,
+                                   BinaryPlainPageDecoder<OLAP_FIELD_TYPE_VARCHAR>>();
 }
 
 } // namespace segment_v2


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org