You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/15 14:02:30 UTC

[impala] 02/04: IMPALA-7770: SPLIT_PART to support negative indexes

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1f904719e4c7e398a7a8e152d82c8da1590b6d25
Author: norbertluksa <no...@cloudera.com>
AuthorDate: Thu Jul 18 18:20:28 2019 +0200

    IMPALA-7770: SPLIT_PART to support negative indexes
    
    Third parameter of SPLIT_PART (nth field) accepts now
    negative values, and searches the string backwards.
    
    Testing:
     * Added unit tests to expr-test.cc
    
    Change-Id: I2db762989a90bd95661a59eb9c11a29eb2edfafb
    Reviewed-on: http://gerrit.cloudera.org:8080/13880
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/expr-test.cc           |  9 +++++++-
 be/src/exprs/string-functions-ir.cc | 45 ++++++++++++++++++++++++++-----------
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index be80c2b..a406d29 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -4152,18 +4152,25 @@ TEST_P(ExprTest, StringFunctions) {
   TestIsNull("chr(NULL)", TYPE_STRING);
 
   TestStringValue("split_part('abc~!def~!ghi', '~!', 1)", "abc");
+  TestStringValue("split_part('abc~!def~!ghi', '~!', -1)", "ghi");
   TestStringValue("split_part('abc~!~def~!~ghi', '~!', 2)", "~def");
+  TestStringValue("split_part('abc~!~def~!~ghi', '~!', -2)", "~def");
   TestStringValue("split_part('abc@@def@@ghi', '@@', 3)", "ghi");
+  TestStringValue("split_part('abc@@def@@ghi', '@@', -3)", "abc");
   TestStringValue("split_part('abc@@def@@@@ghi', '@@', 4)", "ghi");
+  TestStringValue("split_part('abc@@def@@@@ghi', '@@', -4)", "abc");
   TestStringValue("split_part('abc@@def@@ghi', '@@', 4)", "");
+  TestStringValue("split_part('abc@@def@@ghi', '@@', -4)", "");
   TestStringValue("split_part('', '@@', 1)", "");
+  TestStringValue("split_part('', '@@', -1)", "");
   TestStringValue("split_part('abcdef', '', 1)", "abcdef");
+  TestStringValue("split_part('abcdef', '', -1)", "abcdef");
   TestStringValue("split_part('', '', 1)", "");
+  TestStringValue("split_part('', '', -1)", "");
   TestIsNull("split_part(NULL, NULL, 1)", TYPE_STRING);
   TestIsNull("split_part('abcdefabc', NULL, 1)", TYPE_STRING);
   TestIsNull("split_part(NULL, 'xyz', 1)", TYPE_STRING);
   TestError("split_part('abc@@def@@ghi', '@@', 0)");
-  TestError("split_part('abc@@def@@ghi', '@@', -1)");
 
   TestStringValue("lower('')", "");
   TestStringValue("lower('HELLO')", "hello");
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 5606fcb..11336e3 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -1018,22 +1018,28 @@ StringVal StringFunctions::Chr(FunctionContext* ctx, const IntVal& val) {
 }
 
 // Similar to strstr() except that the strings are not null-terminated
-static char* LocateSubstring(char* haystack, int hay_len, const char* needle, int needle_len) {
+// Parameter 'direction' controls the direction of searching, can be either 1 or -1
+static char* LocateSubstring(char* haystack, const int hay_len, const char* needle,
+    const int needle_len, const int direction = 1) {
   DCHECK_GT(needle_len, 0);
   DCHECK(needle != NULL);
   DCHECK(hay_len == 0 || haystack != NULL);
+  DCHECK(direction == 1 || direction == -1);
+  if (hay_len < needle_len) return nullptr;
+  char* start = haystack;
+  if (direction == -1) start += hay_len - needle_len;
   for (int i = 0; i < hay_len - needle_len + 1; ++i) {
-    char* possible_needle = haystack + i;
+    char* possible_needle = start + direction * i;
     if (strncmp(possible_needle, needle, needle_len) == 0) return possible_needle;
   }
-  return NULL;
+  return nullptr;
 }
 
 StringVal StringFunctions::SplitPart(FunctionContext* context,
     const StringVal& str, const StringVal& delim, const BigIntVal& field) {
   if (str.is_null || delim.is_null || field.is_null) return StringVal::null();
   int field_pos = field.val;
-  if (field_pos <= 0) {
+  if (field_pos == 0) {
     stringstream ss;
     ss << "Invalid field position: " << field.val;
     context->SetError(ss.str().c_str());
@@ -1041,23 +1047,36 @@ StringVal StringFunctions::SplitPart(FunctionContext* context,
   }
   if (delim.len == 0) return str;
   char* str_start = reinterpret_cast<char*>(str.ptr);
-  char* str_part = str_start;
   char* delimiter = reinterpret_cast<char*>(delim.ptr);
-  for (int cur_pos = 1; ; ++cur_pos) {
-    int remaining_len = str.len - (str_part - str_start);
-    char* delim_ref = LocateSubstring(str_part, remaining_len, delimiter, delim.len);
-    if (delim_ref == NULL) {
+  const int DIRECTION = field_pos > 0 ? 1 : -1;
+  char* window_start = str_start;
+  char* window_end = str_start + str.len;
+  for (int cur_pos = DIRECTION; ; cur_pos += DIRECTION) {
+    int remaining_len = window_end - window_start;
+    char* delim_ref = LocateSubstring(window_start, remaining_len, delimiter, delim.len,
+        DIRECTION);
+    if (delim_ref == nullptr) {
       if (cur_pos == field_pos) {
-        return StringVal(reinterpret_cast<uint8_t*>(str_part), remaining_len);
+        return StringVal(reinterpret_cast<uint8_t*>(window_start), remaining_len);
       }
       // Return empty string if required field position is not found.
       return StringVal();
     }
     if (cur_pos == field_pos) {
-      return StringVal(reinterpret_cast<uint8_t*>(str_part),
-          delim_ref - str_part);
+      if (DIRECTION < 0) {
+        window_start = delim_ref + delim.len;
+      }
+      else {
+        window_end = delim_ref;
+      }
+      return StringVal(reinterpret_cast<uint8_t*>(window_start),
+          window_end - window_start);
+    }
+    if (DIRECTION < 0) {
+      window_end = delim_ref;
+    } else {
+      window_start = delim_ref + delim.len;
     }
-    str_part = delim_ref + delim.len;
   }
   return StringVal();
 }