You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/06/23 04:29:18 UTC

[doris] branch master updated: [optimize](storage)optimize date in storage layer (#8967)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d73f170eeb [optimize](storage)optimize date in storage layer (#8967)
d73f170eeb is described below

commit d73f170eeb5a9cc7194d5eb55df33eb7ccacbba7
Author: wangbo <wa...@apache.org>
AuthorDate: Thu Jun 23 12:29:10 2022 +0800

    [optimize](storage)optimize date in storage layer (#8967)
    
    * opt date in storage
    
    * code style
    
    Co-authored-by: Wang Bo <wa...@meituan.com>
---
 be/src/olap/comparison_predicate.cpp               | 45 ++++++++++++++++++----
 be/src/olap/in_list_predicate.h                    | 32 ++++++++++++++-
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  2 +-
 be/src/olap/schema.cpp                             |  2 +-
 be/src/olap/uint24.h                               |  2 +
 be/src/vec/columns/column_vector.h                 | 19 +++++----
 be/src/vec/columns/predicate_column.h              | 36 +++++++++++++++++
 be/src/vec/runtime/vdatetime_value.h               | 13 +++++++
 8 files changed, 131 insertions(+), 20 deletions(-)

diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp
index fc50c354fd..91ef9f7156 100644
--- a/be/src/olap/comparison_predicate.cpp
+++ b/be/src/olap/comparison_predicate.cpp
@@ -224,6 +224,9 @@ COMPARISON_PRED_COLUMN_EVALUATE(LessEqualPredicate, <=, true)
 COMPARISON_PRED_COLUMN_EVALUATE(GreaterPredicate, >, true)
 COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
 
+// TODO(wb): DATE values are stored as uint32_t in the predicate column, but are still evaluated
+// through Predicate<uint24_t>. This is kept for compatibility with the row-version predicates.
+// DATE can switch to Predicate<uint32_t> once the row version is removed.
 #define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP)                                           \
     template <class T>                                                                           \
     void CLASS<T>::evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) const { \
@@ -236,15 +239,40 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true)
             auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>(      \
                                         *(nullable_column->get_null_map_column_ptr()))           \
                                         .get_data();                                             \
-            for (uint16_t i = 0; i < size; i++) {                                                \
-                flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]);                       \
+            if constexpr (std::is_same_v<T, uint24_t>) {                                         \
+                auto& predicate_column =                                                         \
+                        reinterpret_cast<const vectorized::PredicateColumnType<uint32_t>&>(      \
+                                nullable_column->get_nested_column());                           \
+                uint32_t int32_val = 0;                                                          \
+                char* int32_val_ptr = (char*)&int32_val;                                         \
+                memory_copy(int32_val_ptr, _value.get_data(), sizeof(uint24_t));                 \
+                auto& data_array_uint32_t = predicate_column.get_data();                         \
+                for (uint16_t i = 0; i < size; i++) {                                            \
+                    flags[i] = (data_array_uint32_t[i] OP int32_val) && (!null_bitmap[i]);       \
+                }                                                                                \
+            } else {                                                                             \
+                for (uint16_t i = 0; i < size; i++) {                                            \
+                    flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]);                   \
+                }                                                                                \
             }                                                                                    \
         } else {                                                                                 \
-            auto& predicate_column =                                                             \
-                    reinterpret_cast<vectorized::PredicateColumnType<T>&>(column);               \
-            auto& data_array = predicate_column.get_data();                                      \
-            for (uint16_t i = 0; i < size; i++) {                                                \
-                flags[i] = data_array[i] OP _value;                                              \
+            if constexpr (std::is_same_v<T, uint24_t>) {                                         \
+                auto& predicate_column =                                                         \
+                        reinterpret_cast<vectorized::PredicateColumnType<uint32_t>&>(column);    \
+                uint32_t int32_val = 0;                                                          \
+                char* int32_val_ptr = (char*)&int32_val;                                         \
+                memory_copy(int32_val_ptr, _value.get_data(), sizeof(uint24_t));                 \
+                auto& data_array = predicate_column.get_data();                                  \
+                for (uint16_t i = 0; i < size; i++) {                                            \
+                    flags[i] = data_array[i] OP int32_val;                                       \
+                }                                                                                \
+            } else {                                                                             \
+                auto& predicate_column =                                                         \
+                        reinterpret_cast<vectorized::PredicateColumnType<T>&>(column);           \
+                auto& data_array = predicate_column.get_data();                                  \
+                for (uint16_t i = 0; i < size; i++) {                                            \
+                    flags[i] = data_array[i] OP _value;                                          \
+                }                                                                                \
             }                                                                                    \
         }                                                                                        \
         if (_opposite) {                                                                         \
@@ -502,6 +530,7 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=)
     template CLASS<decimal12_t>::CLASS(uint32_t column_id, const decimal12_t& value,           \
                                        bool opposite);                                         \
     template CLASS<uint24_t>::CLASS(uint32_t column_id, const uint24_t& value, bool opposite); \
+    template CLASS<uint32_t>::CLASS(uint32_t column_id, const uint32_t& value, bool opposite); \
     template CLASS<uint64_t>::CLASS(uint32_t column_id, const uint64_t& value, bool opposite); \
     template CLASS<bool>::CLASS(uint32_t column_id, const bool& value, bool opposite);
 
@@ -663,6 +692,8 @@ COMPARISON_PRED_COLUMN_EVALUATE_DECLARATION(GreaterEqualPredicate)
                                                    bool* flags) const;                         \
     template void CLASS<uint24_t>::evaluate_vec(vectorized::IColumn& column, uint16_t size,    \
                                                 bool* flags) const;                            \
+    template void CLASS<uint32_t>::evaluate_vec(vectorized::IColumn& column, uint16_t size,    \
+                                                bool* flags) const;                            \
     template void CLASS<uint64_t>::evaluate_vec(vectorized::IColumn& column, uint16_t size,    \
                                                 bool* flags) const;                            \
     template void CLASS<bool>::evaluate_vec(vectorized::IColumn& column, uint16_t size,        \
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index e39686abd4..04ec211568 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -290,7 +290,37 @@ private:
                             uint16_t* sel, uint16_t size) const {
         uint16_t new_size = 0;
 
-        if (column->is_column_dictionary()) {
+        if constexpr (std::is_same_v<T, uint24_t>) {
+            auto* nested_col_ptr =
+                    vectorized::check_and_get_column<vectorized::PredicateColumnType<uint32_t>>(
+                            column);
+            auto& data_array = nested_col_ptr->get_data();
+
+            uint24_t tmp_uint24_value;
+            for (uint16_t i = 0; i < size; i++) {
+                uint16_t idx = sel[i];
+                if constexpr (is_nullable) {
+                    if ((*null_map)[idx]) {
+                        if constexpr (is_opposite) {
+                            sel[new_size++] = idx;
+                        }
+                        continue;
+                    }
+                }
+
+                memcpy((char*)(&tmp_uint24_value), (char*)(&(data_array[idx])), sizeof(uint24_t));
+                if constexpr (!is_opposite) {
+                    if (_operator(_values.find(tmp_uint24_value), _values.end())) {
+                        sel[new_size++] = idx;
+                    }
+                } else {
+                    if (!_operator(_values.find(tmp_uint24_value), _values.end())) {
+                        sel[new_size++] = idx;
+                    }
+                }
+            }
+
+        } else if (column->is_column_dictionary()) {
             if constexpr (std::is_same_v<T, StringValue>) {
                 auto* nested_col_ptr = vectorized::check_and_get_column<
                         vectorized::ColumnDictionary<vectorized::Int32>>(column);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 14e5b92935..b3945e11ec 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -661,7 +661,7 @@ void SegmentIterator::_vec_init_lazy_materialization() {
                 predicate->type() == PredicateType::IN_LIST ||
                 predicate->type() == PredicateType::NOT_IN_LIST ||
                 predicate->type() == PredicateType::IS_NULL ||
-                predicate->type() == PredicateType::IS_NOT_NULL || type == OLAP_FIELD_TYPE_DATE ||
+                predicate->type() == PredicateType::IS_NOT_NULL ||
                 type == OLAP_FIELD_TYPE_DECIMAL) {
                 short_cir_pred_col_id_set.insert(cid);
                 _short_cir_eval_predicate.push_back(predicate);
diff --git a/be/src/olap/schema.cpp b/be/src/olap/schema.cpp
index 8c218ab8ed..a5e7896147 100644
--- a/be/src/olap/schema.cpp
+++ b/be/src/olap/schema.cpp
@@ -151,7 +151,7 @@ vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(FieldType type)
         return doris::vectorized::PredicateColumnType<doris::vectorized::Int128>::create();
 
     case OLAP_FIELD_TYPE_DATE:
-        return doris::vectorized::PredicateColumnType<uint24_t>::create();
+        return doris::vectorized::PredicateColumnType<uint32_t>::create();
 
     case OLAP_FIELD_TYPE_DATETIME:
         return doris::vectorized::PredicateColumnType<uint64_t>::create();
diff --git a/be/src/olap/uint24.h b/be/src/olap/uint24.h
index 1605d893c9..f56ca7ddc6 100644
--- a/be/src/olap/uint24.h
+++ b/be/src/olap/uint24.h
@@ -140,6 +140,8 @@ public:
         return std::string(buf);
     }
 
+    const uint8_t* get_data() const { return data; } // read-only pointer to the 3 packed bytes held by this value
+
 private:
     uint8_t data[3];
 } __attribute__((packed));
diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h
index e57ffe4a9c..1d42455c76 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -169,17 +169,16 @@ public:
     }
 
     void insert_date_column(const char* data_ptr, size_t num) {
-        size_t value_size = sizeof(uint24_t);
+        size_t input_value_size = sizeof(uint24_t);
+
         for (int i = 0; i < num; i++) {
-            const char* cur_ptr = data_ptr + value_size * i;
-            uint64_t value = 0;
-            value = *(unsigned char*)(cur_ptr + 2);
-            value <<= 8;
-            value |= *(unsigned char*)(cur_ptr + 1);
-            value <<= 8;
-            value |= *(unsigned char*)(cur_ptr);
-            vectorized::VecDateTimeValue date = VecDateTimeValue::create_from_olap_date(value);
-            this->insert_data(reinterpret_cast<char*>(&date), 0);
+            uint64_t val = 0;
+            memcpy((char*)(&val), data_ptr, input_value_size);
+            data_ptr += input_value_size;
+
+            VecDateTimeValue date;
+            date.set_olap_date(val);
+            data.push_back_without_reserve(unaligned_load<Int64>(reinterpret_cast<char*>(&date)));
         }
     }
 
diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h
index eec3f1def7..27b1b7e5e0 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -63,6 +63,20 @@ private:
         }
     }
 
+    void insert_date32_to_res_column(const uint16_t* sel, size_t sel_size,
+                                     vectorized::ColumnVector<Int64>* res_ptr) { // decode the selected DATE values of this column and append them to res_ptr
+        res_ptr->reserve(sel_size); // NOTE(review): presumably res_ptr is empty here; if not, confirm this reserves enough for the appends below
+        auto& res_data = res_ptr->get_data();
+
+        for (size_t i = 0; i < sel_size; i++) {
+            uint64_t val = data[sel[i]]; // sel[i] is used as a row index into this column's data
+            VecDateTimeValue date;
+            date.set_olap_date(val); // unchecked decode of the packed OLAP date
+            res_data.push_back_without_reserve(
+                    unaligned_load<Int64>(reinterpret_cast<char*>(&date))); // store the date object's bytes as one Int64 element
+        }
+    }
+
     void insert_datetime_to_res_column(const uint16_t* sel, size_t sel_size,
                                        vectorized::ColumnVector<Int64>* res_ptr) {
         for (size_t i = 0; i < sel_size; i++) {
@@ -205,6 +219,21 @@ public:
         }
     }
 
+    void insert_many_date(const char* data_ptr, size_t num) { // append num packed uint24_t DATE values, widening each into a uint32_t slot
+        constexpr size_t input_type_size = sizeof(uint24_t);
+        constexpr size_t res_type_size = sizeof(uint32_t);
+        const char* input_data_ptr = data_ptr; // the source is only read, so no const_cast is needed
+
+        char* res_ptr = (char*)data.get_end_ptr(); // NOTE(review): no reserve is done here; presumably the caller guarantees capacity for num more values — confirm
+        memset(res_ptr, 0, res_type_size * num); // zero the slots first so the byte not covered by the 3-byte copy is 0
+        for (size_t i = 0; i < num; i++) { // size_t index matches the type of num (avoids a signed/unsigned comparison)
+            memcpy(res_ptr, input_data_ptr, input_type_size);
+            res_ptr += res_type_size;
+            input_data_ptr += input_type_size;
+        }
+        data.set_end_ptr(res_ptr); // publish the new end of the column after the last written slot
+    }
+
     void insert_many_fix_len_data(const char* data_ptr, size_t num) override {
         if constexpr (std::is_same_v<T, decimal12_t>) {
             insert_many_in_copy_way(data_ptr, num);
@@ -212,6 +241,10 @@ public:
             insert_many_in_copy_way(data_ptr, num);
         } else if constexpr (std::is_same_v<T, StringValue>) {
             // here is unreachable, just for compilation to be able to pass
+        } else if constexpr (std::is_same_v<
+                                     T,
+                                     uint32_t>) { // todo(wb) a trick type judge here,need refactor
+            insert_many_date(data_ptr, num);
         } else {
             insert_many_default_type(data_ptr, num);
         }
@@ -405,6 +438,9 @@ public:
         } else if constexpr (std::is_same_v<T, uint24_t>) {
             insert_date_to_res_column(sel, sel_size,
                                       reinterpret_cast<vectorized::ColumnVector<Int64>*>(col_ptr));
+        } else if constexpr (std::is_same_v<T, uint32_t>) { // a trick type judge, need refactor it.
+            insert_date32_to_res_column(
+                    sel, sel_size, reinterpret_cast<vectorized::ColumnVector<Int64>*>(col_ptr));
         } else if constexpr (std::is_same_v<T, doris::vectorized::Int128>) {
             insert_default_value_res_column(
                     sel, sel_size,
diff --git a/be/src/vec/runtime/vdatetime_value.h b/be/src/vec/runtime/vdatetime_value.h
index 126b36f5b4..4c1f4cc137 100644
--- a/be/src/vec/runtime/vdatetime_value.h
+++ b/be/src/vec/runtime/vdatetime_value.h
@@ -223,6 +223,19 @@ public:
         return check_range_and_set_time(year, month, day, hour, minute, second, _type);
     }
 
+    // note(wb): decodes a packed OLAP date (day: bits 0-4, month: bits 5-8, year: bits 9 and up); no range check is done here
+    void inline set_olap_date(uint64_t olap_date_val) {
+        _neg = 0;
+        _type = TIME_DATE;
+
+        _day = olap_date_val & 0x1f; // low 5 bits
+        _month = (olap_date_val >> 5) & 0x0f; // next 4 bits
+        _year = olap_date_val >> 9; // all remaining high bits
+        _hour = 0; // time-of-day fields are cleared for a TIME_DATE value
+        _minute = 0;
+        _second = 0;
+    }
+
     uint64_t to_olap_date() const {
         uint64_t val;
         val = _year;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org