You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by kx...@apache.org on 2023/06/06 15:15:09 UTC

[doris] 07/36: [Optimize](function) Optimize locate function by compare across strings (#20290)

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0-beta
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 9e44e20be570bd9780f900bf739841b9a57804ba
Author: ZhangYu0123 <67...@users.noreply.github.com>
AuthorDate: Mon Jun 5 12:43:14 2023 +0800

    [Optimize](function) Optimize locate function by compare across strings (#20290)
    
    Optimize the locate function by searching across the strings' contiguous data in a single pass instead of string by string. About 90% speed-up, as tested with sum().
---
 be/src/vec/functions/function_string.cpp           | 44 +++++++++++++++-------
 .../string_functions/test_string_function.out      | 24 +++++++++---
 .../string_functions/test_string_function.groovy   | 16 +++++---
 3 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index 670644bba4..dbfbff8800 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -205,28 +205,46 @@ struct StringInStrImpl {
         res.resize(size);
 
         if (rdata.size == 0) {
-            for (int i = 0; i < size; ++i) {
-                res[i] = 1;
-            }
+            std::fill(res.begin(), res.end(), 1);
             return Status::OK();
         }
 
+        const UInt8* begin = ldata.data();
+        const UInt8* end = begin + ldata.size();
+        const UInt8* pos = begin;
+
+        /// Current index in the array of strings.
+        size_t i = 0;
+        std::fill(res.begin(), res.end(), 0);
+
         StringRef rstr_ref(rdata.data, rdata.size);
         StringSearch search(&rstr_ref);
 
-        for (int i = 0; i < size; ++i) {
-            const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]);
-            int l_str_size = loffsets[i] - loffsets[i - 1];
+        while (pos < end) {
+            // search return matched substring start offset
+            pos = (UInt8*)search.search((char*)pos, end - pos);
+            if (pos >= end) {
+                break;
+            }
 
-            StringRef lstr_ref(l_raw_str, l_str_size);
+            /// Determine which index it refers to.
+            /// begin + loffsets[i] is the start offset of the string at index i+1
+            while (begin + loffsets[i] < pos) {
+                ++i;
+            }
 
-            // Hive returns positions starting from 1.
-            int loc = search.search(&lstr_ref);
-            if (loc > 0) {
-                size_t len = std::min(lstr_ref.size, (size_t)loc);
-                loc = simd::VStringFunctions::get_char_len(lstr_ref.data, len);
+            /// We check that the entry does not pass through the boundaries of strings.
+            if (pos + rdata.size <= begin + loffsets[i]) {
+                int loc = pos - begin - loffsets[i - 1];
+                int l_str_size = loffsets[i] - loffsets[i - 1];
+                size_t len = std::min(l_str_size, loc);
+                loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len);
+                res[i] = loc + 1;
             }
-            res[i] = loc + 1;
+
+            // move to next string offset
+            pos = begin + loffsets[i];
+            ++i;
         }
 
         return Status::OK();
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index 587319531e..b51fb32d61 100644
--- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
+++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
@@ -122,18 +122,24 @@ A
 -- !sql --
 AB
 
--- !sql --
+-- !sql_instr --
 2
 
--- !sql --
+-- !sql_instr --
 0
 
--- !sql --
+-- !sql_instr --
 \N
 
--- !sql --
+-- !sql_instr --
 \N
 
+-- !sql_instr --
+1
+
+-- !sql_instr --
+5
+
 -- !sql --
 abc123
 
@@ -152,12 +158,18 @@ doris
 -- !sql --
 3
 
--- !sql --
+-- !sql_locate --
 4
 
--- !sql --
+-- !sql_locate --
 0
 
+-- !sql_locate --
+1
+
+-- !sql_locate --
+5
+
 -- !sql --
 xyxhi
 
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index ae33e448e7..6a06992322 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -69,10 +69,12 @@ suite("test_string_function") {
     qt_sql "select unhex('41');"
     qt_sql "select unhex('4142');"
 
-    qt_sql "select instr(\"abc\", \"b\");"
-    qt_sql "select instr(\"abc\", \"d\");"
-    qt_sql "select instr(\"abc\", null);"
-    qt_sql "select instr(null, \"a\");"
+    qt_sql_instr "select instr(\"abc\", \"b\");"
+    qt_sql_instr "select instr(\"abc\", \"d\");"
+    qt_sql_instr "select instr(\"abc\", null);"
+    qt_sql_instr "select instr(null, \"a\");"
+    qt_sql_instr "SELECT instr('foobar', '');"
+    qt_sql_instr "SELECT instr('上海天津北京杭州', '北京');"
 
     qt_sql "SELECT lcase(\"AbC123\");"
     qt_sql "SELECT lower(\"AbC123\");"
@@ -84,8 +86,10 @@ suite("test_string_function") {
 
     qt_sql "select length(\"abc\");"
 
-    qt_sql "SELECT LOCATE('bar', 'foobarbar');"
-    qt_sql "SELECT LOCATE('xbar', 'foobar');"
+    qt_sql_locate "SELECT LOCATE('bar', 'foobarbar');"
+    qt_sql_locate "SELECT LOCATE('xbar', 'foobar');"
+    qt_sql_locate "SELECT LOCATE('', 'foobar');"
+    qt_sql_locate "SELECT LOCATE('北京', '上海天津北京杭州');"
 
     qt_sql "SELECT lpad(\"hi\", 5, \"xy\");"
     qt_sql "SELECT lpad(\"hi\", 1, \"xy\");"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org