You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/06/06 15:15:09 UTC
[doris] 07/36: [Optimize](function) Optimize locate function by compare across strings (#20290)
This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0-beta
in repository https://gitbox.apache.org/repos/asf/doris.git
commit 9e44e20be570bd9780f900bf739841b9a57804ba
Author: ZhangYu0123 <67...@users.noreply.github.com>
AuthorDate: Mon Jun 5 12:43:14 2023 +0800
[Optimize](function) Optimize locate function by compare across strings (#20290)
Optimize the locate function by searching across the whole string column at once instead of string by string. About a 90% speedup when tested with sum().
---
be/src/vec/functions/function_string.cpp | 44 +++++++++++++++-------
.../string_functions/test_string_function.out | 24 +++++++++---
.../string_functions/test_string_function.groovy | 16 +++++---
3 files changed, 59 insertions(+), 25 deletions(-)
diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index 670644bba4..dbfbff8800 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -205,28 +205,46 @@ struct StringInStrImpl {
res.resize(size);
if (rdata.size == 0) {
- for (int i = 0; i < size; ++i) {
- res[i] = 1;
- }
+ std::fill(res.begin(), res.end(), 1);
return Status::OK();
}
+ const UInt8* begin = ldata.data();
+ const UInt8* end = begin + ldata.size();
+ const UInt8* pos = begin;
+
+ /// Current index in the array of strings.
+ size_t i = 0;
+ std::fill(res.begin(), res.end(), 0);
+
StringRef rstr_ref(rdata.data, rdata.size);
StringSearch search(&rstr_ref);
- for (int i = 0; i < size; ++i) {
- const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]);
- int l_str_size = loffsets[i] - loffsets[i - 1];
+ while (pos < end) {
+ // search returns the start of the matched substring
+ pos = (UInt8*)search.search((char*)pos, end - pos);
+ if (pos >= end) {
+ break;
+ }
- StringRef lstr_ref(l_raw_str, l_str_size);
+ /// Determine which index it refers to.
+ /// begin + loffsets[i] is the start offset of the string at index i+1
+ while (begin + loffsets[i] < pos) {
+ ++i;
+ }
- // Hive returns positions starting from 1.
- int loc = search.search(&lstr_ref);
- if (loc > 0) {
- size_t len = std::min(lstr_ref.size, (size_t)loc);
- loc = simd::VStringFunctions::get_char_len(lstr_ref.data, len);
+ /// We check that the entry does not pass through the boundaries of strings.
+ if (pos + rdata.size <= begin + loffsets[i]) {
+ int loc = pos - begin - loffsets[i - 1];
+ int l_str_size = loffsets[i] - loffsets[i - 1];
+ size_t len = std::min(l_str_size, loc);
+ loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len);
+ res[i] = loc + 1;
}
- res[i] = loc + 1;
+
+ // move to next string offset
+ pos = begin + loffsets[i];
+ ++i;
}
return Status::OK();
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index 587319531e..b51fb32d61 100644
--- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
+++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
@@ -122,18 +122,24 @@ A
-- !sql --
AB
--- !sql --
+-- !sql_instr --
2
--- !sql --
+-- !sql_instr --
0
--- !sql --
+-- !sql_instr --
\N
--- !sql --
+-- !sql_instr --
\N
+-- !sql_instr --
+1
+
+-- !sql_instr --
+5
+
-- !sql --
abc123
@@ -152,12 +158,18 @@ doris
-- !sql --
3
--- !sql --
+-- !sql_locate --
4
--- !sql --
+-- !sql_locate --
0
+-- !sql_locate --
+1
+
+-- !sql_locate --
+5
+
-- !sql --
xyxhi
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index ae33e448e7..6a06992322 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -69,10 +69,12 @@ suite("test_string_function") {
qt_sql "select unhex('41');"
qt_sql "select unhex('4142');"
- qt_sql "select instr(\"abc\", \"b\");"
- qt_sql "select instr(\"abc\", \"d\");"
- qt_sql "select instr(\"abc\", null);"
- qt_sql "select instr(null, \"a\");"
+ qt_sql_instr "select instr(\"abc\", \"b\");"
+ qt_sql_instr "select instr(\"abc\", \"d\");"
+ qt_sql_instr "select instr(\"abc\", null);"
+ qt_sql_instr "select instr(null, \"a\");"
+ qt_sql_instr "SELECT instr('foobar', '');"
+ qt_sql_instr "SELECT instr('上海天津北京杭州', '北京');"
qt_sql "SELECT lcase(\"AbC123\");"
qt_sql "SELECT lower(\"AbC123\");"
@@ -84,8 +86,10 @@ suite("test_string_function") {
qt_sql "select length(\"abc\");"
- qt_sql "SELECT LOCATE('bar', 'foobarbar');"
- qt_sql "SELECT LOCATE('xbar', 'foobar');"
+ qt_sql_locate "SELECT LOCATE('bar', 'foobarbar');"
+ qt_sql_locate "SELECT LOCATE('xbar', 'foobar');"
+ qt_sql_locate "SELECT LOCATE('', 'foobar');"
+ qt_sql_locate "SELECT LOCATE('北京', '上海天津北京杭州');"
qt_sql "SELECT lpad(\"hi\", 5, \"xy\");"
qt_sql "SELECT lpad(\"hi\", 1, \"xy\");"
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org