You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/07/27 08:44:04 UTC
[doris] branch master updated: Optimize regexp and like using hyperscan (#11116)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 4ea2c04676 Optimize regexp and like using hyperscan (#11116)
4ea2c04676 is described below
commit 4ea2c04676e104a9e25576cf7d4e357577707abf
Author: Kang <kx...@gmail.com>
AuthorDate: Wed Jul 27 16:43:58 2022 +0800
Optimize regexp and like using hyperscan (#11116)
* use hyperscan instead of re2 for regexp and like function
---
be/CMakeLists.txt | 4 ++
be/src/vec/functions/like.cpp | 137 ++++++++++++++++++++++++++----------------
be/src/vec/functions/like.h | 47 ++++++++++++---
3 files changed, 125 insertions(+), 63 deletions(-)
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index c4ad9093e6..7f11c4cb9c 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -144,6 +144,9 @@ set_target_properties(backtrace PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/l
add_library(re2 STATIC IMPORTED)
set_target_properties(re2 PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libre2.a)
+add_library(hyperscan STATIC IMPORTED)
+set_target_properties(hyperscan PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib64/libhs.a)
+
add_library(odbc STATIC IMPORTED)
set_target_properties(odbc PROPERTIES IMPORTED_LOCATION ${THIRDPARTY_DIR}/lib/libodbc.a)
@@ -638,6 +641,7 @@ set(COMMON_THIRDPARTY
thriftnb
glog
re2
+ hyperscan
pprof
lz4
libevent
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 522fb0b621..8f84c0f21b 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -82,6 +82,61 @@ Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const Str
return Status::OK();
}
+Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const StringValue& val,
+ const StringValue& pattern, unsigned char* result) {
+ auto ret = hs_scan(state->hs_database.get(), val.ptr, val.len, 0, state->hs_scratch.get(),
+ state->hs_match_handler, (void*)result);
+ if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+ return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
+ }
+
+ return Status::OK();
+}
+
+Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const StringValue& val,
+ const StringValue& pattern, unsigned char* result) {
+ std::string re_pattern(pattern.ptr, pattern.len);
+
+ hs_database_t* database = nullptr;
+ hs_scratch_t* scratch = nullptr;
+ RETURN_IF_ERROR(hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
+
+ auto ret =
+ hs_scan(database, val.ptr, val.len, 0, scratch, state->hs_match_handler, (void*)result);
+ if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
+ return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
+ }
+
+ hs_free_scratch(scratch);
+ hs_free_database(database);
+
+ return Status::OK();
+}
+
+// hyperscan compile expression to database and allocate scratch space
+Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
+ hs_database_t** database, hs_scratch_t** scratch) {
+ hs_compile_error_t* compile_err;
+ if (hs_compile(expression, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, database, &compile_err) !=
+ HS_SUCCESS) {
+ hs_free_compile_error(compile_err);
+ *database = nullptr;
+ if (context) context->set_error("hs_compile regex pattern error");
+ return Status::RuntimeError("hs_compile regex pattern error");
+ }
+ hs_free_compile_error(compile_err);
+
+ if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
+ hs_free_database(*database);
+ *database = nullptr;
+ *scratch = nullptr;
+ if (context) context->set_error("hs_alloc_scratch allocate scratch space error");
+ return Status::RuntimeError("hs_alloc_scratch allocate scratch space error");
+ }
+
+ return Status::OK();
+}
+
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
const ColumnNumbers& arguments, size_t result,
size_t /*input_rows_count*/) {
@@ -183,28 +238,20 @@ Status FunctionLikeBase::vector_vector(const ColumnString::Chars& values,
Status FunctionLike::like_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result) {
std::string re_pattern;
- RE2::Options opts;
- opts.set_never_nl(false);
- opts.set_dot_nl(true);
convert_like_pattern(state, std::string(pattern.ptr, pattern.len), &re_pattern);
- re2::RE2 re(re_pattern, opts);
- if (re.ok()) {
- *result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), re);
- return Status::OK();
- } else {
- return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
- }
-}
-Status FunctionLike::constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
- const StringValue& pattern, unsigned char* result) {
- *result = RE2::FullMatch(re2::StringPiece(val.ptr, val.len), *state->regex.get());
- return Status::OK();
+ return regexp_fn(state, val, {re_pattern.c_str(), (int)re_pattern.size()}, result);
}
void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern,
std::string* re_pattern) {
re_pattern->clear();
+
+ // add ^ to pattern head to match line head
+ if (pattern.size() > 0 && pattern[0] != '%') {
+ re_pattern->append("^");
+ }
+
bool is_escaped = false;
for (size_t i = 0; i < pattern.size(); ++i) {
if (!is_escaped && pattern[i] == '%') {
@@ -229,6 +276,11 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
is_escaped = false;
}
}
+
+ // add $ to pattern tail to match line tail
+ if (pattern.size() > 0 && pattern[pattern.size() - 1] != '%') {
+ re_pattern->append("$");
+ }
}
void FunctionLike::remove_escape_character(std::string* search_string) {
@@ -279,14 +331,15 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function
} else {
std::string re_pattern;
convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
- RE2::Options opts;
- opts.set_never_nl(false);
- opts.set_dot_nl(true);
- state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
- if (!state->search_state.regex->ok()) {
- return Status::InternalError("Invalid regex expression: {}", pattern_str);
- }
- state->function = constant_regex_full_fn;
+
+ hs_database_t* database = nullptr;
+ hs_scratch_t* scratch = nullptr;
+ RETURN_IF_ERROR(hs_prepare(context, re_pattern.c_str(), &database, &scratch));
+
+ state->search_state.hs_database.reset(database);
+ state->search_state.hs_scratch.reset(scratch);
+
+ state->function = constant_regex_fn;
}
}
return Status::OK();
@@ -319,39 +372,17 @@ Status FunctionRegexp::prepare(FunctionContext* context,
state->search_state.set_search_string(search_string);
state->function = constant_substring_fn;
} else {
- RE2::Options opts;
- opts.set_never_nl(false);
- opts.set_dot_nl(true);
- state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
- if (!state->search_state.regex->ok()) {
- return Status::InternalError("Invalid regex expression: {}", pattern_str);
- }
- state->function = constant_regex_partial_fn;
- }
- }
- return Status::OK();
-}
+ hs_database_t* database = nullptr;
+ hs_scratch_t* scratch = nullptr;
+ RETURN_IF_ERROR(hs_prepare(context, pattern_str.c_str(), &database, &scratch));
-Status FunctionRegexp::constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
- const StringValue& pattern,
- unsigned char* result) {
- *result = RE2::PartialMatch(re2::StringPiece(val.ptr, val.len), *state->regex);
- return Status::OK();
-}
+ state->search_state.hs_database.reset(database);
+ state->search_state.hs_scratch.reset(scratch);
-Status FunctionRegexp::regexp_fn(LikeSearchState* state, const StringValue& val,
- const StringValue& pattern, unsigned char* result) {
- std::string re_pattern(pattern.ptr, pattern.len);
- RE2::Options opts;
- opts.set_never_nl(false);
- opts.set_dot_nl(true);
- re2::RE2 re(re_pattern, opts);
- if (re.ok()) {
- *result = RE2::PartialMatch(re2::StringPiece(val.ptr, val.len), re);
- return Status::OK();
- } else {
- return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
+ state->function = constant_regex_fn;
+ }
}
+ return Status::OK();
}
} // namespace doris::vectorized
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index e19b92d0c2..df164b20d0 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -17,6 +17,8 @@
#pragma once
+#include <hs/hs.h>
+
#include <functional>
#include <memory>
@@ -56,6 +58,31 @@ struct LikeSearchState {
/// Used for RLIKE and REGEXP predicates if the pattern is a constant argument.
std::unique_ptr<re2::RE2> regex;
+ template <typename Deleter, Deleter deleter>
+ struct HyperscanDeleter {
+ template <typename T>
+ void operator()(T* ptr) const {
+ deleter(ptr);
+ }
+ };
+
+ // hyperscan compiled pattern database and scratch space, reused for performance
+ std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>
+ hs_database;
+ std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>
+ hs_scratch;
+
+ // hyperscan match callback
+ static int hs_match_handler(unsigned int /* from */, // NOLINT
+ unsigned long long /* from */, // NOLINT
+ unsigned long long /* to */, // NOLINT
+ unsigned int /* flags */, void* ctx) {
+ // set result to 1 for matched row
+ *((unsigned char*)ctx) = 1;
+ /// return non-zero to indicate hyperscan stop after first matched
+ return 1;
+ }
+
LikeSearchState() : escape_char('\\') {}
void set_search_string(const std::string& search_string_arg) {
@@ -105,6 +132,16 @@ protected:
static Status constant_substring_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
+
+ static Status constant_regex_fn(LikeSearchState* state, const StringValue& val,
+ const StringValue& pattern, unsigned char* result);
+
+ static Status regexp_fn(LikeSearchState* state, const StringValue& val,
+ const StringValue& pattern, unsigned char* result);
+
+ // hyperscan compile expression to database and allocate scratch space
+ static Status hs_prepare(FunctionContext* context, const char* expression,
+ hs_database_t** database, hs_scratch_t** scratch);
};
class FunctionLike : public FunctionLikeBase {
@@ -121,9 +158,6 @@ private:
static Status like_fn(LikeSearchState* state, const StringValue& val,
const StringValue& pattern, unsigned char* result);
- static Status constant_regex_full_fn(LikeSearchState* state, const StringValue& val,
- const StringValue& pattern, unsigned char* result);
-
static void convert_like_pattern(LikeSearchState* state, const std::string& pattern,
std::string* re_pattern);
@@ -139,13 +173,6 @@ public:
String get_name() const override { return name; }
Status prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
-
-private:
- static Status regexp_fn(LikeSearchState* state, const StringValue& val,
- const StringValue& pattern, unsigned char* result);
-
- static Status constant_regex_partial_fn(LikeSearchState* state, const StringValue& val,
- const StringValue& pattern, unsigned char* result);
};
void register_function_like(SimpleFunctionFactory& factory) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org