You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ph...@apache.org on 2018/02/02 18:51:37 UTC
[13/19] impala git commit: IMPALA-3282: Adds regexp_escape built-in function
IMPALA-3282: Adds regexp_escape built-in function
Escapes the following characters, which are treated as special by the RE2 regular expression library:
.\+*?[^]$(){}=!<>|:-
Testing:
Added unit tests to ExprTest.StringRegexpFunctions.
Added end-to-end (E2E) tests to exprs.test.
Change-Id: I84c3e0ded26f6eb20794c38b75be9b25cd111e4b
Reviewed-on: http://gerrit.cloudera.org:8080/8900
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/9c08ca2d
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/9c08ca2d
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/9c08ca2d
Branch: refs/heads/2.x
Commit: 9c08ca2dd43c086e74c24a5f3e79f83fd9c5fecc
Parents: 530fa27
Author: Jinchul <ji...@gmail.com>
Authored: Tue Dec 19 11:29:16 2017 +0900
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Feb 2 01:10:15 2018 +0000
----------------------------------------------------------------------
be/src/exprs/expr-test.cc | 40 +++++++++++++++++++-
be/src/exprs/string-functions-ir.cc | 23 +++++++++++
be/src/exprs/string-functions.h | 1 +
common/function-registry/impala_functions.py | 1 +
.../queries/QueryTest/exprs.test | 28 ++++++++++++++
5 files changed, 91 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index c6e81f1..5d63f2d 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -4244,10 +4244,46 @@ TEST_F(ExprTest, StringRegexpFunctions) {
TestIsNull("regexp_match_count(NULL, '.*')", TYPE_INT);
TestIsNull("regexp_match_count('a123', NULL)", TYPE_INT);
TestIsNull("regexp_match_count(NULL, NULL)", TYPE_INT);
+
+ TestIsNull("regexp_escape(NULL)", TYPE_STRING);
+ TestStringValue("regexp_escape('')", "");
+ // Test special character escape
+ // .\+*?[^]$(){}=!<>|:-
+ TestStringValue("regexp_escape('Hello.world')", R"(Hello\.world)");
+ TestStringValue(R"(regexp_escape('Hello\\world'))", R"(Hello\\world)");
+ TestStringValue("regexp_escape('Hello+world')", R"(Hello\+world)");
+ TestStringValue("regexp_escape('Hello*world')", R"(Hello\*world)");
+ TestStringValue("regexp_escape('Hello?world')", R"(Hello\?world)");
+ TestStringValue("regexp_escape('Hello[world')", R"(Hello\[world)");
+ TestStringValue("regexp_escape('Hello^world')", R"(Hello\^world)");
+ TestStringValue("regexp_escape('Hello]world')", R"(Hello\]world)");
+ TestStringValue("regexp_escape('Hello$world')", R"(Hello\$world)");
+ TestStringValue("regexp_escape('Hello(world')", R"(Hello\(world)");
+ TestStringValue("regexp_escape('Hello)world')", R"(Hello\)world)");
+ TestStringValue("regexp_escape('Hello{world')", R"(Hello\{world)");
+ TestStringValue("regexp_escape('Hello}world')", R"(Hello\}world)");
+ TestStringValue("regexp_escape('Hello=world')", R"(Hello\=world)");
+ TestStringValue("regexp_escape('Hello!world')", R"(Hello\!world)");
+ TestStringValue("regexp_escape('Hello<world')", R"(Hello\<world)");
+ TestStringValue("regexp_escape('Hello>world')", R"(Hello\>world)");
+ TestStringValue("regexp_escape('Hello|world')", R"(Hello\|world)");
+ TestStringValue("regexp_escape('Hello:world')", R"(Hello\:world)");
+ TestStringValue("regexp_escape('Hello-world')", R"(Hello\-world)");
+ // Mixed case
+ TestStringValue(R"(regexp_escape('a.b\\c+d*e?f[g]h$i(j)k{l}m=n!o<p>q|r:s-t'))",
+ R"(a\.b\\c\+d\*e\?f\[g\]h\$i\(j\)k\{l\}m\=n\!o\<p\>q\|r\:s\-t)");
+ // Mixed case with other regexp_* functions
+ TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+ R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 0))", R"(Hello\\world)");
+ TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+ R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 1))", "Hello");
+ TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+ R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 2))", R"(\\)");
+ TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+ R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 3))", "world");
}
-TEST_F(ExprTest, StringParseUrlFunction) {
- // TODO: For now, our parse_url my not behave exactly like Hive
+TEST_F(ExprTest, StringParseUrlFunction) { // TODO: For now, our parse_url my not behave exactly like Hive
// when given malformed URLs.
// If necessary, we can closely follow Java's URL implementation
// to behave exactly like Hive.
http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 50378bd..50c37b2 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -26,6 +26,7 @@
#include "exprs/anyval-util.h"
#include "exprs/scalar-expr.h"
+#include "gutil/strings/charset.h"
#include "runtime/string-value.inline.h"
#include "runtime/tuple-row.h"
#include "util/bit-util.h"
@@ -670,6 +671,28 @@ void StringFunctions::RegexpClose(
context->SetFunctionState(scope, nullptr);
}
+StringVal StringFunctions::RegexpEscape(FunctionContext* context, const StringVal& str) {
+ if (str.is_null) return StringVal::null();
+ if (str.len == 0) return str;
+
+ static const strings::CharSet REGEX_ESCAPE_CHARACTERS(".\\+*?[^]$(){}=!<>|:-");
+ const uint8_t* const start_ptr = str.ptr;
+ const uint8_t* const end_ptr = start_ptr + str.len;
+ StringVal result(context, str.len * 2);
+ if (UNLIKELY(result.is_null)) return StringVal::null();
+ uint8_t* dest_ptr = result.ptr;
+ for (const uint8_t* c = start_ptr; c < end_ptr; ++c) {
+ if (REGEX_ESCAPE_CHARACTERS.Test(*c)) {
+ *dest_ptr++ = '\\';
+ }
+ *dest_ptr++ = *c;
+ }
+ result.len = dest_ptr - result.ptr;
+ DCHECK_GE(result.len, str.len);
+
+ return result;
+}
+
StringVal StringFunctions::RegexpExtract(FunctionContext* context, const StringVal& str,
const StringVal& pattern, const BigIntVal& index) {
if (str.is_null || pattern.is_null || index.is_null) return StringVal::null();
http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/be/src/exprs/string-functions.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index 91ad2cc..45876f8 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -117,6 +117,7 @@ class StringFunctions {
re2::RE2::Options* opts);
static void RegexpPrepare(FunctionContext*, FunctionContext::FunctionStateScope);
static void RegexpClose(FunctionContext*, FunctionContext::FunctionStateScope);
+ static StringVal RegexpEscape(FunctionContext*, const StringVal& str);
static StringVal RegexpExtract(FunctionContext*, const StringVal& str,
const StringVal& pattern, const BigIntVal& index);
static StringVal RegexpReplace(FunctionContext*, const StringVal& str,
http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/common/function-registry/impala_functions.py
----------------------------------------------------------------------
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index b78062b..8174abb 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -462,6 +462,7 @@ visible_functions = [
[['locate'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Locate'],
[['locate'], 'INT', ['STRING', 'STRING', 'BIGINT'],
'impala::StringFunctions::LocatePos'],
+ [['regexp_escape'], 'STRING', ['STRING'], 'impala::StringFunctions::RegexpEscape'],
[['regexp_extract'], 'STRING', ['STRING', 'STRING', 'BIGINT'],
'impala::StringFunctions::RegexpExtract',
'_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index a15f3b5..4d8b193 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -2452,6 +2452,34 @@ select regexp_match_count(tmp.str, tmp.pattern, tmp.start_pos, tmp.params) from
Illegal match parameter x
====
---- QUERY
+select regexp_escape(tmp.str) from (values
+('a.b\\c+d*e?f[g]h$i(j)k{l}m=n!o<p>q|r:s-t' as str)) as tmp
+---- RESULTS
+'a\\.b\\\\c\\+d\\*e\\?f\\[g\\]h\\$i\\(j\\)k\\{l\\}m\\=n\\!o\\<p\\>q\\|r\\:s\\-t'
+---- TYPES
+string
+====
+---- QUERY
+select regexp_extract(regexp_escape(tmp.str),
+tmp.pattern, tmp.index) from (values
+('Hello\\world' as str, '([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)' as pattern, 2 as index)
+) as tmp
+---- RESULTS
+'\\\\'
+---- TYPES
+string
+====
+---- QUERY
+select regexp_extract(regexp_escape(tmp.str),
+tmp.pattern, tmp.index) from (values
+('Hello\\world' as str, '([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)' as pattern, 3 as index)
+) as tmp
+---- RESULTS
+'world'
+---- TYPES
+string
+====
+---- QUERY
# IMPALA-2147: IS [NOT] DISTINCT FROM and "<=>"
select NULL <=> NULL
---- RESULTS