You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mj...@apache.org on 2016/09/13 20:28:50 UTC

incubator-impala git commit: IMPALA-3973: add position and occurrence to instr()

Repository: incubator-impala
Updated Branches:
  refs/heads/master 94fc6a3d0 -> 64ffbc4b5


IMPALA-3973: add position and occurrence to instr()

Change-Id: Ie9648de458d243306fa14adc5e7f7002bf6f67fd
Reviewed-on: http://gerrit.cloudera.org:8080/4094
Tested-by: Internal Jenkins
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/64ffbc4b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/64ffbc4b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/64ffbc4b

Branch: refs/heads/master
Commit: 64ffbc4b5112b21e738a16218cb396c9b65ff70d
Parents: 94fc6a3
Author: Zoltan Ivanfi <zi...@cloudera.com>
Authored: Tue Aug 23 13:26:05 2016 +0200
Committer: Matthew Jacobs <mj...@cloudera.com>
Committed: Tue Sep 13 20:28:27 2016 +0000

----------------------------------------------------------------------
 be/src/experiments/CMakeLists.txt            |   2 +-
 be/src/experiments/string-search-sse-test.cc | 119 +++++++++++++++++++
 be/src/experiments/string-search-test.cc     | 123 --------------------
 be/src/exprs/expr-test.cc                    |  65 +++++++++++
 be/src/exprs/string-functions-ir.cc          |  69 +++++++++--
 be/src/exprs/string-functions.h              |   5 +
 be/src/runtime/CMakeLists.txt                |   1 +
 be/src/runtime/string-search-test.cc         | 135 ++++++++++++++++++++++
 be/src/runtime/string-search.h               |  71 +++++++++++-
 common/function-registry/impala_functions.py |   3 +
 10 files changed, 458 insertions(+), 135 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/experiments/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/experiments/CMakeLists.txt b/be/src/experiments/CMakeLists.txt
index 743d2ab..e4d4895 100644
--- a/be/src/experiments/CMakeLists.txt
+++ b/be/src/experiments/CMakeLists.txt
@@ -38,4 +38,4 @@ target_link_libraries(tuple-splitter-test Experiments ${IMPALA_LINK_LIBS})
 target_link_libraries(hash-partition-test ${IMPALA_LINK_LIBS})
 target_link_libraries(compression-test ${IMPALA_LINK_LIBS})
 
-ADD_BE_TEST(string-search-test)
+ADD_BE_TEST(string-search-sse-test)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/experiments/string-search-sse-test.cc
----------------------------------------------------------------------
diff --git a/be/src/experiments/string-search-sse-test.cc b/be/src/experiments/string-search-sse-test.cc
new file mode 100644
index 0000000..e570a6b
--- /dev/null
+++ b/be/src/experiments/string-search-sse-test.cc
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <string>
+
+#include "experiments/string-search-sse.h"
+
+#include "common/names.h"
+
+namespace impala {
+
+// Test string search for haystack/needle, up to len for each.
+// If the length is -1, use the full string length
+// haystack/needle are null terminated
+void TestSearch(const char* haystack_orig, const char* needle_orig, int haystack_len = -1,
+    int needle_len = -1) {
+  string haystack_copy(haystack_orig);
+  string needle_copy(needle_orig);
+  const char* haystack = haystack_copy.c_str();
+  const char* needle = needle_copy.c_str();
+
+  string haystack_buffer, needle_buffer;
+
+  const char* haystack_null_terminated = haystack;
+  const char* needle_null_terminated = needle;
+
+  // Make null terminated versions (for libc).
+  if (haystack_len != -1) {
+    haystack_buffer = string(haystack, haystack_len);
+    haystack_null_terminated = haystack_buffer.c_str();
+  } else {
+    haystack_len = strlen(haystack);
+  }
+
+  if (needle_len != -1) {
+    needle_buffer = string(needle, needle_len);
+    needle_null_terminated = needle_buffer.c_str();
+  } else {
+    needle_len = strlen(needle);
+  }
+
+  // Call libc for correct result
+  const char* libc_result = strstr(haystack_null_terminated, needle_null_terminated);
+  int libc_offset = (libc_result == NULL) ? -1 : libc_result - haystack_null_terminated;
+
+  StringValue haystack_str_val(const_cast<char*>(haystack), haystack_len);
+
+  // Call our strstr with null terminated needle
+  StringSearchSSE needle1(needle_null_terminated);
+  int null_offset = needle1.Search(haystack_str_val);
+  EXPECT_EQ(null_offset, libc_offset);
+
+  // Call our strstr with non-null terminated needle
+  StringValue needle_str_val(const_cast<char*>(needle), needle_len);
+  StringSearchSSE needle2(&needle_str_val);
+  int not_null_offset = needle2.Search(haystack_str_val);
+  EXPECT_EQ(not_null_offset, libc_offset);
+
+  // Ensure haystack/needle are unmodified
+  EXPECT_EQ(strlen(needle_null_terminated), needle_len);
+  EXPECT_EQ(strlen(haystack_null_terminated), haystack_len);
+  EXPECT_EQ(strcmp(haystack, haystack_orig), 0);
+  EXPECT_EQ(strcmp(needle, needle_orig), 0);
+}
+
+TEST(StringSearchTest, Basic) {
+  // Basic Tests
+  TestSearch("abcd", "a");
+  TestSearch("abcd", "ab");
+  TestSearch("abcd", "c");
+  TestSearch("abcd", "cd");
+  TestSearch("", "");
+  TestSearch("abc", "");
+  TestSearch("", "a");
+
+  // Test single chars
+  TestSearch("a", "a");
+  TestSearch("a", "b");
+  TestSearch("abc", "b");
+
+  // Haystack is not full string
+  TestSearch("abcabcd", "abc", 5);
+  TestSearch("abcabcd", "abcd", 5);
+  TestSearch("abcabcd", "abcd", 0);
+
+  // Haystack and needle not full len
+  TestSearch("abcde", "abaaaaaa", 4, 2);
+  TestSearch("abcde", "abaaaaaa", 4, 3);
+  TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3);
+  TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0);
+  TestSearch("abcdabaaaaa", "abaaaaaa", 0, 0);
+
+  // Test last bit, this is the interesting edge case
+  TestSearch("abcde", "e");
+  TestSearch("abcde", "de");
+  TestSearch("abcdede", "cde");
+  TestSearch("abcdacde", "cde");
+}
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/experiments/string-search-test.cc
----------------------------------------------------------------------
diff --git a/be/src/experiments/string-search-test.cc b/be/src/experiments/string-search-test.cc
deleted file mode 100644
index cc46297..0000000
--- a/be/src/experiments/string-search-test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <string>
-#include <gtest/gtest.h>
-
-#include "experiments/string-search-sse.h"
-
-#include "common/names.h"
-
-namespace impala {
-
-// Test string search for haystack/needle, up to len for each.
-// If the length is -1, use the full string length
-// haystack/needle are null terminated
-void TestSearch(const char* haystack_orig, const char* needle_orig,
-    int haystack_len = -1, int needle_len = -1) {
-
-  string haystack_copy(haystack_orig);
-  string needle_copy(needle_orig);
-  const char* haystack = haystack_copy.c_str();
-  const char* needle = needle_copy.c_str();
-
-  string haystack_buffer, needle_buffer;
-
-  const char* haystack_null_terminated = haystack;
-  const char* needle_null_terminated = needle;
-
-  // Make null terminated versions (for libc).
-  if (haystack_len != -1) {
-    haystack_buffer = string(haystack, haystack_len);
-    haystack_null_terminated = haystack_buffer.c_str();
-  } else {
-    haystack_len = strlen(haystack);
-  }
-
-  if (needle_len != -1) {
-    needle_buffer = string(needle, needle_len);
-    needle_null_terminated = needle_buffer.c_str();
-  } else {
-    needle_len = strlen(needle);
-  }
-
-  // Call libc for correct result
-  const char* libc_result = strstr(haystack_null_terminated, needle_null_terminated);
-  int libc_offset = (libc_result == NULL) ? -1 : libc_result - haystack_null_terminated;
-
-  StringValue haystack_str_val(const_cast<char*>(haystack), haystack_len);
-
-  // Call our strstr with null terminated needle
-  StringSearchSSE needle1(needle_null_terminated);
-  int null_offset = needle1.Search(haystack_str_val);
-  EXPECT_EQ(null_offset, libc_offset);
-
-  // Call our strstr with non-null terminated needle
-  StringValue needle_str_val(const_cast<char*>(needle), needle_len);
-  StringSearchSSE needle2(&needle_str_val);
-  int not_null_offset = needle2.Search(haystack_str_val);
-  EXPECT_EQ(not_null_offset, libc_offset);
-
-  // Ensure haystack/needle are unmodified
-  EXPECT_EQ(strlen(needle_null_terminated), needle_len);
-  EXPECT_EQ(strlen(haystack_null_terminated), haystack_len);
-  EXPECT_EQ(strcmp(haystack, haystack_orig), 0);
-  EXPECT_EQ(strcmp(needle, needle_orig), 0);
-}
-
-
-TEST(StringSearchTest, Basic) {
-  // Basic Tests
-  TestSearch("abcd", "a");
-  TestSearch("abcd", "ab");
-  TestSearch("abcd", "c");
-  TestSearch("abcd", "cd");
-  TestSearch("", "");
-  TestSearch("abc", "");
-  TestSearch("", "a");
-
-  // Test single chars
-  TestSearch("a", "a");
-  TestSearch("a", "b");
-  TestSearch("abc", "b");
-
-  // Haystack is not full string
-  TestSearch("abcabcd", "abc", 5);
-  TestSearch("abcabcd", "abcd", 5);
-  TestSearch("abcabcd", "abcd", 0);
-
-  // Haystack and needle not full len
-  TestSearch("abcde", "abaaaaaa", 4, 2);
-  TestSearch("abcde", "abaaaaaa", 4, 3);
-  TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3);
-  TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0);
-  TestSearch("abcdabaaaaa", "abaaaaaa", 0, 0);
-
-  // Test last bit, this is the interesting edge case
-  TestSearch("abcde", "e");
-  TestSearch("abcde", "de");
-  TestSearch("abcdede", "cde");
-  TestSearch("abcdacde", "cde");
-}
-
-}
-
-int main(int argc, char **argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 67bbe0a..8a19603 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -2064,9 +2064,74 @@ TEST_F(ExprTest, StringFunctions) {
   TestValue("instr('', '')", TYPE_INT, 0);
   TestValue("instr('', 'abc')", TYPE_INT, 0);
   TestValue("instr('abc', '')", TYPE_INT, 0);
+  TestValue("instr('abcdef', 'xyz')", TYPE_INT, 0);
+  TestValue("instr('xyz', 'abcdef')", TYPE_INT, 0);
   TestValue("instr('abc', 'abc')", TYPE_INT, 1);
   TestValue("instr('xyzabc', 'abc')", TYPE_INT, 4);
   TestValue("instr('xyzabcxyz', 'bcx')", TYPE_INT, 5);
+
+  TestValue("instr('', '', -1)", TYPE_INT, 0);
+  TestValue("instr('', 'abc', -1)", TYPE_INT, 0);
+  TestValue("instr('abc', '', -1)", TYPE_INT, 0);
+  TestValue("instr('abc', 'abc', -1)", TYPE_INT, 1);
+  TestValue("instr('xyzabc', 'abc', -1)", TYPE_INT, 4);
+  TestValue("instr('xyzabcxyz', 'bcx', -1)", TYPE_INT, 5);
+
+  TestValue("instr('corporate floor', 'or', 0)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', 'or', 14)", TYPE_INT, 14);
+  TestValue("instr('corporate floor', 'or', 15)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', 'or', -14)", TYPE_INT, 2);
+  TestValue("instr('corporate floor', 'or', -15)", TYPE_INT, 0);
+
+  TestValue("instr('corporate floor', 'or', 0, 1)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', 'or', 14, 1)", TYPE_INT, 14);
+  TestValue("instr('corporate floor', 'or', 15, 1)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', 'or', -14, 1)", TYPE_INT, 2);
+  TestValue("instr('corporate floor', 'or', -15, 1)", TYPE_INT, 0);
+
+  TestValue("instr('corporate floor', 'or', 2, 2)", TYPE_INT, 5);
+  TestValue("instr('corporate floor', 'or', 3, 2)", TYPE_INT, 14);
+  TestValue("instr('corporate floor', 'or', -3, 2)", TYPE_INT, 2);
+  TestValue("instr('corporate floor', 'or', -2, 2)", TYPE_INT, 5);
+
+  TestValue("instr('corporate floor', 'or', 3, 3)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', 'or', -3, 3)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', '', 3, 3)", TYPE_INT, 0);
+  TestValue("instr('corporate floor', '', -3, 3)", TYPE_INT, 0);
+
+  TestValue("instr('abababa', 'aba', 1, 1)", TYPE_INT, 1);
+  TestValue("instr('abababa', 'aba', 1, 2)", TYPE_INT, 3);
+  TestValue("instr('abababa', 'aba', 1, 3)", TYPE_INT, 5);
+  TestValue("instr('abababa', 'aba', cast(1 as bigint), cast(3 as bigint))", TYPE_INT, 5);
+
+  TestError("instr('corporate floor', 'or', 0, 0)");
+  TestError("instr('corporate floor', 'or', 0, -1)");
+  TestError("instr('corporate floor', 'or', 1, 0)");
+  TestError("instr('corporate floor', 'or', 1, -1)");
+
+  TestIsNull("instr(NULL, 'or', 2)", TYPE_INT);
+  TestIsNull("instr('corporate floor', NULL, 2)", TYPE_INT);
+  TestIsNull("instr('corporate floor', 'or', NULL)", TYPE_INT);
+
+  TestIsNull("instr(NULL, 'or', 2, 2)", TYPE_INT);
+  TestIsNull("instr('corporate floor', NULL, 2, 2)", TYPE_INT);
+  TestIsNull("instr('corporate floor', 'or', NULL, 2)", TYPE_INT);
+  TestIsNull("instr('corporate floor', 'or', 2, NULL)", TYPE_INT);
+
+  TestValue("instr('a', 'a', 1, 1)", TYPE_INT, 1);
+  TestValue("instr('axyz', 'a', 1, 1)", TYPE_INT, 1);
+  TestValue("instr('xyza', 'a', 1, 1)", TYPE_INT, 4);
+  TestValue("instr('a', 'a', 1, 2)", TYPE_INT, 0);
+  TestValue("instr('axyz', 'a', 1, 2)", TYPE_INT, 0);
+  TestValue("instr('xyza', 'a', 1, 2)", TYPE_INT, 0);
+
+  TestValue("instr('a', 'a', -1, 1)", TYPE_INT, 1);
+  TestValue("instr('axyz', 'a', -1, 1)", TYPE_INT, 1);
+  TestValue("instr('xyza', 'a', -1, 1)", TYPE_INT, 4);
+  TestValue("instr('a', 'a', -1, 2)", TYPE_INT, 0);
+  TestValue("instr('axyz', 'a', -1, 2)", TYPE_INT, 0);
+  TestValue("instr('xyza', 'a', -1, 2)", TYPE_INT, 0);
+
   TestIsNull("instr(NULL, 'bcx')", TYPE_INT);
   TestIsNull("instr('xyzabcxyz', NULL)", TYPE_INT);
   TestIsNull("instr(NULL, NULL)", TYPE_INT);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 06b16af..39cf898 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -284,13 +284,68 @@ IntVal StringFunctions::Ascii(FunctionContext* context, const StringVal& str) {
 }
 
 IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
-    const StringVal& substr) {
-  if (str.is_null || substr.is_null) return IntVal::null();
-  StringValue str_sv = StringValue::FromStringVal(str);
-  StringValue substr_sv = StringValue::FromStringVal(substr);
-  StringSearch search(&substr_sv);
-  // Hive returns positions starting from 1.
-  return IntVal(search.Search(&str_sv) + 1);
+    const StringVal& substr, const BigIntVal& start_position,
+    const BigIntVal& occurrence) {
+  if (str.is_null || substr.is_null || start_position.is_null || occurrence.is_null) {
+    return IntVal::null();
+  }
+  if (occurrence.val <= 0) {
+    stringstream ss;
+    ss << "Invalid occurrence parameter to instr function: " << occurrence.val;
+    context->SetError(ss.str().c_str());
+    return IntVal(0);
+  }
+  if (start_position.val == 0) return IntVal(0);
+
+  StringValue haystack = StringValue::FromStringVal(str);
+  StringValue needle = StringValue::FromStringVal(substr);
+  StringSearch search(&needle);
+  if (start_position.val > 0) {
+    // A positive starting position indicates regular searching from the left.
+    int search_start_pos = start_position.val - 1;
+    if (search_start_pos >= haystack.len) return IntVal(0);
+    int match_pos = -1;
+    for (int match_num = 0; match_num < occurrence.val; ++match_num) {
+      DCHECK_LE(search_start_pos, haystack.len);
+      StringValue haystack_substring = haystack.Substring(search_start_pos);
+      int match_pos_in_substring = search.Search(&haystack_substring);
+      if (match_pos_in_substring < 0) return IntVal(0);
+      match_pos = search_start_pos + match_pos_in_substring;
+      search_start_pos = match_pos + 1;
+    }
+    // Return positions starting from 1 at the leftmost position.
+    return IntVal(match_pos + 1);
+  } else {
+    // A negative starting position indicates searching from the right.
+    int search_start_pos = haystack.len + start_position.val;
+    // The needle must fit between search_start_pos and the end of the string
+    if (search_start_pos + needle.len > haystack.len) {
+      search_start_pos = haystack.len - needle.len;
+    }
+    if (search_start_pos < 0) return IntVal(0);
+    int match_pos = -1;
+    for (int match_num = 0; match_num < occurrence.val; ++match_num) {
+      DCHECK_GE(search_start_pos + needle.len, 0);
+      DCHECK_LE(search_start_pos + needle.len, haystack.len);
+      StringValue haystack_substring =
+          haystack.Substring(0, search_start_pos + needle.len);
+      match_pos = search.RSearch(&haystack_substring);
+      if (match_pos < 0) return IntVal(0);
+      search_start_pos = match_pos - 1;
+    }
+    // Return positions starting from 1 at the leftmost position.
+    return IntVal(match_pos + 1);
+  }
+}
+
+IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
+    const StringVal& substr, const BigIntVal& start_position) {
+  return Instr(context, str, substr, start_position, BigIntVal(1));
+}
+
+IntVal StringFunctions::Instr(
+    FunctionContext* context, const StringVal& str, const StringVal& substr) {
+  return Instr(context, str, substr, BigIntVal(1), BigIntVal(1));
 }
 
 IntVal StringFunctions::Locate(FunctionContext* context, const StringVal& substr,

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/exprs/string-functions.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index 85fda0e..fb3a900 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -60,7 +60,12 @@ class StringFunctions {
   static StringVal Ltrim(FunctionContext*, const StringVal& str);
   static StringVal Rtrim(FunctionContext*, const StringVal& str);
   static IntVal Ascii(FunctionContext*, const StringVal& str);
+  static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr,
+      const BigIntVal& start_position, const BigIntVal& occurrence);
+  static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr,
+      const BigIntVal& start_position);
   static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr);
+
   static IntVal Locate(FunctionContext*, const StringVal& substr, const StringVal& str);
   static IntVal LocatePos(FunctionContext*, const StringVal& substr, const StringVal& str,
       const BigIntVal& start_pos);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/runtime/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index f0f722d..6bdce48 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -78,6 +78,7 @@ ADD_BE_TEST(disk-io-mgr-test)
 ADD_BE_TEST(buffered-block-mgr-test)
 ADD_BE_TEST(parallel-executor-test)
 ADD_BE_TEST(raw-value-test)
+ADD_BE_TEST(string-search-test)
 ADD_BE_TEST(string-value-test)
 ADD_BE_TEST(thread-resource-mgr-test)
 ADD_BE_TEST(mem-tracker-test)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/runtime/string-search-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/string-search-test.cc b/be/src/runtime/string-search-test.cc
new file mode 100644
index 0000000..a7804cc
--- /dev/null
+++ b/be/src/runtime/string-search-test.cc
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "runtime/string-search.h"
+
+namespace impala {
+
+StringValue StrValFromCString(const char* str, int len) {
+  if (len == -1) len = strlen(str);
+  return StringValue(const_cast<char*>(str), len);
+}
+
+// Test string search for haystack/needle, up to len for each.
+// If the length is -1, use the full string length.
+// Searching in a substring allows checking whether the search respects the
+// length stored in the StringValue or also reads parts of the string which
+// should be excluded.
+int TestSearch(const char* haystack, const char* needle, int haystack_len = -1,
+    int needle_len = -1) {
+  StringValue needle_str_val = StrValFromCString(needle, needle_len);
+  StringValue haystack_str_val = StrValFromCString(haystack, haystack_len);
+  StringSearch search(&needle_str_val);
+  return search.Search(&haystack_str_val);
+}
+
+// Test reverse string search for haystack/needle, up to len for each.
+// If the length is -1, use the full string length.
+// Searching in a substring allows checking whether the search respects the
+// length stored in the StringValue or also reads parts of the string which
+// should be excluded.
+int TestRSearch(const char* haystack, const char* needle, int haystack_len = -1,
+    int needle_len = -1) {
+  StringValue needle_str_val = StrValFromCString(needle, needle_len);
+  StringValue haystack_str_val = StrValFromCString(haystack, haystack_len);
+  StringSearch search(&needle_str_val);
+  return search.RSearch(&haystack_str_val);
+}
+
+TEST(StringSearchTest, RegularSearch) {
+  // Basic Tests
+  EXPECT_EQ(0, TestSearch("abcdabcd", "a"));
+  EXPECT_EQ(0, TestSearch("abcdabcd", "ab"));
+  EXPECT_EQ(2, TestSearch("abcdabcd", "c"));
+  EXPECT_EQ(2, TestSearch("abcdabcd", "cd"));
+  EXPECT_EQ(-1, TestSearch("", ""));
+  EXPECT_EQ(-1, TestSearch("abc", ""));
+  EXPECT_EQ(-1, TestSearch("", "a"));
+
+  // Test single chars
+  EXPECT_EQ(0, TestSearch("a", "a"));
+  EXPECT_EQ(-1, TestSearch("a", "b"));
+  EXPECT_EQ(1, TestSearch("abc", "b"));
+
+  // Test searching in a substring of the haystack
+  EXPECT_EQ(0, TestSearch("abcabcd", "abc", 5));
+  EXPECT_EQ(-1, TestSearch("abcabcd", "abcd", 5));
+  EXPECT_EQ(-1, TestSearch("abcabcd", "abcd", 0));
+
+  // Test searching a substring of the needle in a substring of the haystack.
+  EXPECT_EQ(0, TestSearch("abcde", "abaaaaaa", 4, 2));
+  EXPECT_EQ(-1, TestSearch("abcde", "abaaaaaa", 4, 3));
+  EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa", 4, 3));
+  EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa", 4, 0));
+  EXPECT_EQ(-1, TestSearch("abcdabaaaaa", "abaaaaaa"));
+
+  // Test with needle matching the end of the substring.
+  EXPECT_EQ(4, TestSearch("abcde", "e"));
+  EXPECT_EQ(3, TestSearch("abcde", "de"));
+  EXPECT_EQ(2, TestSearch("abcdede", "cde"));
+  EXPECT_EQ(5, TestSearch("abcdacde", "cde"));
+
+  // Test correct skip_ handling, which depends on which char of the needle is
+  // the same as the last one.
+  EXPECT_EQ(2, TestSearch("ababcacac", "abcacac"));
+}
+
+TEST(StringSearchTest, ReverseSearch) {
+  // Basic Tests
+  EXPECT_EQ(4, TestRSearch("abcdabcd", "a"));
+  EXPECT_EQ(4, TestRSearch("abcdabcd", "ab"));
+  EXPECT_EQ(6, TestRSearch("abcdabcd", "c"));
+  EXPECT_EQ(6, TestRSearch("abcdabcd", "cd"));
+  EXPECT_EQ(-1, TestRSearch("", ""));
+  EXPECT_EQ(-1, TestRSearch("abc", ""));
+  EXPECT_EQ(-1, TestRSearch("", "a"));
+
+  // Test single chars
+  EXPECT_EQ(0, TestRSearch("a", "a"));
+  EXPECT_EQ(-1, TestRSearch("a", "b"));
+  EXPECT_EQ(1, TestRSearch("abc", "b"));
+
+  // Test searching in a substring of the haystack
+  EXPECT_EQ(0, TestRSearch("abcabcd", "abc", 5));
+  EXPECT_EQ(-1, TestRSearch("abcabcd", "abcd", 5));
+  EXPECT_EQ(-1, TestRSearch("abcabcd", "abcd", 0));
+
+  // Test searching a substring of the needle in a substring of the haystack.
+  EXPECT_EQ(0, TestRSearch("abcde", "abaaaaaa", 4, 2));
+  EXPECT_EQ(-1, TestRSearch("abcde", "abaaaaaa", 4, 3));
+  EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa", 4, 3));
+  EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa", 4, 0));
+  EXPECT_EQ(-1, TestRSearch("abcdabaaaaa", "abaaaaaa"));
+
+  // Test with needle matching the end of the substring.
+  EXPECT_EQ(4, TestRSearch("abcde", "e"));
+  EXPECT_EQ(3, TestRSearch("abcde", "de"));
+  EXPECT_EQ(2, TestRSearch("abcdede", "cde"));
+  EXPECT_EQ(5, TestRSearch("abcdacde", "cde"));
+
+  // Test correct rskip_ handling, which depends on which char of the needle is
+  // the same as the first one.
+  EXPECT_EQ(0, TestRSearch("cacacbaba", "cacacba"));
+}
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/be/src/runtime/string-search.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/string-search.h b/be/src/runtime/string-search.h
index e1a45e0..5bb5d02 100644
--- a/be/src/runtime/string-search.h
+++ b/be/src/runtime/string-search.h
@@ -15,12 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
 #ifndef IMPALA_RUNTIME_STRING_SEARCH_H
 #define IMPALA_RUNTIME_STRING_SEARCH_H
 
-#include <vector>
+#include <algorithm>
 #include <cstring>
+#include <vector>
+
 #include <boost/cstdint.hpp>
 
 #include "common/logging.h"
@@ -44,7 +45,8 @@ class StringSearch {
   StringSearch() : pattern_(NULL), mask_(0) {}
 
   /// Initialize/Precompute a StringSearch object from the pattern
-  StringSearch(const StringValue* pattern) : pattern_(pattern), mask_(0), skip_(0) {
+  StringSearch(const StringValue* pattern)
+    : pattern_(pattern), mask_(0), skip_(0), rskip_(0) {
     // Special cases
     if (pattern_->len <= 1) {
       return;
@@ -53,13 +55,25 @@ class StringSearch {
     // Build compressed lookup table
     int mlast = pattern_->len - 1;
     skip_ = mlast - 1;
+    rskip_ = mlast - 1;
 
+    // In the Python implementation, building the bloom filter happens at the
+    // beginning of the Search operation. We build it during construction
+    // instead, so that the same StringSearch instance can be reused multiple
+    // times without rebuilding the bloom filter every time.
     for (int i = 0; i < mlast; ++i) {
       BloomAdd(pattern_->ptr[i]);
       if (pattern_->ptr[i] == pattern_->ptr[mlast])
         skip_ = mlast - i - 1;
     }
     BloomAdd(pattern_->ptr[mlast]);
+
+    // The order of iteration doesn't have any effect on the bloom filter, but
+    // it does on skip_. For this reason we need to calculate a separate rskip_
+    // for reverse search.
+    for (int i = mlast; i > 0; i--) {
+      if (pattern_->ptr[i] == pattern_->ptr[0]) rskip_ = i - 1;
+    }
   }
 
   /// Search for this pattern in str.
@@ -114,6 +128,54 @@ class StringSearch {
     return -1;
   }
 
+  /// Search for this pattern in str backwards.
+  ///   Returns the offset into str if the pattern exists
+  ///   Returns -1 if the pattern is not found
+  int RSearch(const StringValue* str) const {
+    // Special cases
+    if (str == NULL || pattern_ == NULL || pattern_->len == 0) {
+      return -1;
+    }
+
+    int mlast = pattern_->len - 1;
+    int w = str->len - pattern_->len;
+    int n = str->len;
+    int m = pattern_->len;
+    const char* s = str->ptr;
+    const char* p = pattern_->ptr;
+
+    // Special case if pattern->len == 1
+    if (m == 1) {
+      const char* result = reinterpret_cast<const char*>(memrchr(s, p[0], n));
+      if (result != NULL) return result - s;
+      return -1;
+    }
+
+    // General case.
+    int j;
+    for (int i = w; i >= 0; i--) {
+      if (s[i] == p[0]) {
+        // candidate match
+        for (j = mlast; j > 0; j--)
+          if (s[i + j] != p[j]) break;
+        if (j == 0) {
+          return i;
+        }
+        // miss: check if previous character is part of pattern
+        if (i > 0 && !BloomQuery(s[i - 1]))
+          i = i - m;
+        else
+          i = i - rskip_;
+      } else {
+        // skip: check if previous character is part of pattern
+        if (i > 0 && !BloomQuery(s[i - 1])) {
+          i = i - m;
+        }
+      }
+    }
+    return -1;
+  }
+
  private:
   static const int BLOOM_WIDTH = 64;
 
@@ -127,7 +189,8 @@ class StringSearch {
 
   const StringValue* pattern_;
   int64_t mask_;
-  int64_t skip_;
+  int skip_;
+  int rskip_;
 };
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/64ffbc4b/common/function-registry/impala_functions.py
----------------------------------------------------------------------
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 10aac10..2141a35 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -431,6 +431,9 @@ visible_functions = [
   [['rtrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Rtrim'],
   [['ascii'], 'INT', ['STRING'], 'impala::StringFunctions::Ascii'],
   [['instr'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Instr'],
+  [['instr'], 'INT', ['STRING', 'STRING', 'BIGINT'], 'impala::StringFunctions::Instr'],
+  [['instr'], 'INT', ['STRING', 'STRING', 'BIGINT', 'BIGINT'],
+   'impala::StringFunctions::Instr'],
   [['locate'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Locate'],
   [['locate'], 'INT', ['STRING', 'STRING', 'BIGINT'],
    'impala::StringFunctions::LocatePos'],