You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/07/27 08:44:04 UTC

[doris] branch master updated: Optimize regexp and like using hyperscan (#11116)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ea2c04676 Optimize regexp and like using hyperscan (#11116)
4ea2c04676 is described below

commit 4ea2c04676e104a9e25576cf7d4e357577707abf
Author: Kang <kx...@gmail.com>
AuthorDate: Wed Jul 27 16:43:58 2022 +0800

    Optimize regexp and like using hyperscan (#11116)
    
    * use hyperscan instead of re2 for regexp and like function
---
 be/CMakeLists.txt             |   4 ++
 be/src/vec/functions/like.cpp | 137 ++++++++++++++++++++++++++----------------
 be/src/vec/functions/like.h   |  47 ++++++++++++---
 3 files changed, 125 insertions(+), 63 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index c4ad9093e6..7f11c4cb9c 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -144,6 +144,9 @@ set_target_properties(backtrace PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/l
 add_library(re2 STATIC IMPORTED)
 set_target_properties(re2 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libre2.a)
 
+add_library(hyperscan STATIC IMPORTED)
+set_target_properties(hyperscan PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib64/libhs.a)
+
 add_library(odbc STATIC IMPORTED)
 set_target_properties(odbc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libodbc.a)
 
@@ -638,6 +641,7 @@ set(COMMON_THIRDPARTY
     thriftnb
     glog
     re2
+    hyperscan
     pprof
     lz4
     libevent
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 522fb0b621..8f84c0f21b 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -82,6 +82,61 @@ Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const Str
     return Status::OK();
 }
 
+Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const StringValue& val,
+                                           const StringValue& pattern, unsigned char* result) {
+    auto ret = hs_scan(state->hs_database.get(), val.ptr, val.len, 0, state->hs_scratch.get(),
+                       state->hs_match_handler, (void*)result);
+    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
+    }
+
+    return Status::OK();
+}
+
+Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const StringValue& val,
+                                   const StringValue& pattern, unsigned char* result) {
+    std::string re_pattern(pattern.ptr, pattern.len);
+
+    hs_database_t* database = nullptr;
+    hs_scratch_t* scratch = nullptr;
+    RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
+
+    auto ret =
+            hs_scan(database, val.ptr, val.len, 0, scratch, state->hs_match_handler, (void*)result);
+    if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+        return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
+    }
+
+    hs_free_scratch(scratch);
+    hs_free_database(database);
+
+    return Status::OK();
+}
+
+// hyperscan compile expression to database and allocate scratch space
+Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
+                                    hs_database_t** database, hs_scratch_t** scratch) {
+    hs_compile_error_t* compile_err;
+    if (hs_compile(expression, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, database, &compile_err) !=
+        HS_SUCCESS) {
+        hs_free_compile_error(compile_err);
+        *database = nullptr;
+        if (context) context->set_error("hs_compile regex pattern error");
+        return Status::RuntimeError("hs_compile regex pattern error");
+    }
+    hs_free_compile_error(compile_err);
+
+    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
+        hs_free_database(*database);
+        *database = nullptr;
+        *scratch = nullptr;
+        if (context) context->set_error("hs_alloc_scratch allocate scratch space error");
+        return Status::RuntimeError("hs_alloc_scratch allocate scratch space error");
+    }
+
+    return Status::OK();
+}
+
 Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, size_t result,
                                       size_t /*input_rows_count*/) {
@@ -183,28 +238,20 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values,
 Status FunctionLike::like_fn(LikeSearchState* state, const StringValue& val,
                              const StringValue& pattern, unsigned char* result) {
     std::string re_pattern;
-    RE2::Options opts;
-    opts.set_never_nl(false);
-    opts.set_dot_nl(true);
     convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &re_pattern);
-    re2::RE2 re(re_pattern, opts);
-    if (re.ok()) {
-        *result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), re);
-        return Status::OK();
-    } else {
-        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
-    }
-}
 
-Status FunctionLike::constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
-                                            const StringValue& pattern, unsigned char* result) {
-    *result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), *state->regex.get());
-    return Status::OK();
+    return regexp_fn(state, val, {re_pattern.c_str(), (int)re_pattern.size()}, result);
 }
 
 void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern,
                                         std::string* re_pattern) {
     re_pattern->clear();
+
+    // add ^ to pattern head to match line head
+    if (pattern.size() > 0 && pattern[0] != '%') {
+        re_pattern->append("^");
+    }
+
     bool is_escaped = false;
     for (size_t i = 0; i < pattern.size(); ++i) {
         if (!is_escaped && pattern[i] == '%') {
@@ -229,6 +276,11 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
             is_escaped = false;
         }
     }
+
+    // add $ to pattern tail to match line tail
+    if (pattern.size() > 0 && pattern[pattern.size() - 1] != '%') {
+        re_pattern->append("$");
+    }
 }
 
 void FunctionLike::remove_escape_character(std::string* search_string) {
@@ -279,14 +331,15 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function
         } else {
             std::string re_pattern;
             convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
-            RE2::Options opts;
-            opts.set_never_nl(false);
-            opts.set_dot_nl(true);
-            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
-            if (!state->search_state.regex->ok()) {
-                return Status::InternalError("Invalid regex expression: {}", pattern_str);
-            }
-            state->function = constant_regex_full_fn;
+
+            hs_database_t* database = nullptr;
+            hs_scratch_t* scratch = nullptr;
+            RETURN_IF_ERROR(hs_prepare(context, re_pattern.c_str(), &database, &scratch));
+
+            state->search_state.hs_database.reset(database);
+            state->search_state.hs_scratch.reset(scratch);
+
+            state->function = constant_regex_fn;
         }
     }
     return Status::OK();
@@ -319,39 +372,17 @@ Status FunctionRegexp::prepare(FunctionContext* context,
             state->search_state.set_search_string(search_string);
             state->function = constant_substring_fn;
         } else {
-            RE2::Options opts;
-            opts.set_never_nl(false);
-            opts.set_dot_nl(true);
-            state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
-            if (!state->search_state.regex->ok()) {
-                return Status::InternalError("Invalid regex expression: {}", pattern_str);
-            }
-            state->function = constant_regex_partial_fn;
-        }
-    }
-    return Status::OK();
-}
+            hs_database_t* database = nullptr;
+            hs_scratch_t* scratch = nullptr;
+            RETURN_IF_ERROR(hs_prepare(context, pattern_str.c_str(), &database, &scratch));
 
-Status FunctionRegexp::constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
-                                                 const StringValue& pattern,
-                                                 unsigned char* result) {
-    *result = RE2::PartialMatch(re2::StringPiece(val.ptr, val.len), *state->regex);
-    return Status::OK();
-}
+            state->search_state.hs_database.reset(database);
+            state->search_state.hs_scratch.reset(scratch);
 
-Status FunctionRegexp::regexp_fn(LikeSearchState* state, const StringValue& val,
-                                 const StringValue& pattern, unsigned char* result) {
-    std::string re_pattern(pattern.ptr, pattern.len);
-    RE2::Options opts;
-    opts.set_never_nl(false);
-    opts.set_dot_nl(true);
-    re2::RE2 re(re_pattern, opts);
-    if (re.ok()) {
-        *result = RE2::PartialMatch(re2::StringPiece(val.ptr, val.len), re);
-        return Status::OK();
-    } else {
-        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
+            state->function = constant_regex_fn;
+        }
     }
+    return Status::OK();
 }
 
 } // namespace doris::vectorized
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index e19b92d0c2..df164b20d0 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -17,6 +17,8 @@
 
 #pragma once
 
+#include <hs/hs.h>
+
 #include <functional>
 #include <memory>
 
@@ -56,6 +58,31 @@ struct LikeSearchState {
     /// Used for RLIKE and REGEXP predicates if the pattern is a constant argument.
     std::unique_ptr<re2::RE2> regex;
 
+    template <typename Deleter, Deleter deleter>
+    struct HyperscanDeleter {
+        template <typename T>
+        void operator()(T* ptr) const {
+            deleter(ptr);
+        }
+    };
+
+    // hyperscan compiled pattern database and scratch space, reused for performance
+    std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>
+            hs_database;
+    std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>
+            hs_scratch;
+
+    // hyperscan match callback
+    static int hs_match_handler(unsigned int /* from */,       // NOLINT
+                                unsigned long long /* from */, // NOLINT
+                                unsigned long long /* to */,   // NOLINT
+                                unsigned int /* flags */, void* ctx) {
+        // set result to 1 for matched row
+        *((unsigned char*)ctx) = 1;
+        /// return non-zero to indicate hyperscan stop after first matched
+        return 1;
+    }
+
     LikeSearchState() : escape_char('\\') {}
 
     void set_search_string(const std::string& search_string_arg) {
@@ -105,6 +132,16 @@ protected:
 
     static Status constant_substring_fn(LikeSearchState* state, const StringValue& val,
                                         const StringValue& pattern, unsigned char* result);
+
+    static Status constant_regex_fn(LikeSearchState* state, const StringValue& val,
+                                    const StringValue& pattern, unsigned char* result);
+
+    static Status regexp_fn(LikeSearchState* state, const StringValue& val,
+                            const StringValue& pattern, unsigned char* result);
+
+    // hyperscan compile expression to database and allocate scratch space
+    static Status hs_prepare(FunctionContext* context, const char* expression,
+                             hs_database_t** database, hs_scratch_t** scratch);
 };
 
 class FunctionLike : public FunctionLikeBase {
@@ -121,9 +158,6 @@ private:
     static Status like_fn(LikeSearchState* state, const StringValue& val,
                           const StringValue& pattern, unsigned char* result);
 
-    static Status constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
-                                         const StringValue& pattern, unsigned char* result);
-
     static void convert_like_pattern(LikeSearchState* state, const std::string& pattern,
                                      std::string* re_pattern);
 
@@ -139,13 +173,6 @@ public:
     String get_name() const override { return name; }
 
     Status prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
-
-private:
-    static Status regexp_fn(LikeSearchState* state, const StringValue& val,
-                            const StringValue& pattern, unsigned char* result);
-
-    static Status constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
-                                            const StringValue& pattern, unsigned char* result);
 };
 
 void register_function_like(SimpleFunctionFactory& factory) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org