You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by at...@apache.org on 2023/06/22 18:55:34 UTC
[pinot] branch master updated: Fixes SQL wildcard escaping in LIKE queries (#10897)
This is an automated email from the ASF dual-hosted git repository.
atri pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 84aa659b71 Fixes SQL wildcard escaping in LIKE queries (#10897)
84aa659b71 is described below
commit 84aa659b713c3970da41a43925427708bc72a8c2
Author: Aravind Suresh <ar...@gmail.com>
AuthorDate: Fri Jun 23 00:25:29 2023 +0530
Fixes SQL wildcard escaping in LIKE queries (#10897)
* Fixes SQL wildcard escaping in LIKE queries
* Updated unit tests
* Fixed lint issues
* Minor refactor
* Addressed reviewer comments
* Fixed lint
* Addressed reviewer comments
* Minor fixes
* Minor refactor
---
.../common/utils/RegexpPatternConverterUtils.java | 87 +++++++++++++++-------
.../utils/RegexpPatternConverterUtilsTest.java | 37 +++++++++
2 files changed, 97 insertions(+), 27 deletions(-)
diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java
index c62cf86f23..31a0d59d95 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java
@@ -18,6 +18,9 @@
*/
package org.apache.pinot.common.utils;
+
+import com.google.common.primitives.Chars;
+
/**
* Utility for converting regex patterns.
*/
@@ -25,9 +28,14 @@ public class RegexpPatternConverterUtils {
private RegexpPatternConverterUtils() {
}
- /* Represents all metacharacters to be processed */
- public static final String[] REGEXP_METACHARACTERS =
- {"\\", "^", "$", ".", "{", "}", "[", "]", "(", ")", "*", "+", "?", "|", "<", ">", "-", "&", "/"};
+ /*
+ * Represents all metacharacters to be processed.
+ * This excludes the \ (back slash) character as that doubles up as an escape character as well.
+ * So it is handled separately in the conversion logic.
+ */
+ public static final char[] REGEXP_METACHARACTERS = new char[]{
+ '^', '$', '.', '{', '}', '[', ']', '(', ')', '*', '+', '?', '|', '<', '>', '-', '&', '/'};
+ public static final char BACK_SLASH = '\\';
/**
* Converts a LIKE pattern into REGEXP_LIKE pattern.
@@ -64,24 +72,61 @@ public class RegexpPatternConverterUtils {
break;
}
- String escaped = escapeMetaCharacters(likePattern.substring(start, end));
- StringBuilder sb = new StringBuilder(escaped.length() + 2);
- sb.append(prefix);
- sb.append(escaped);
- sb.append(suffix);
+ likePattern = likePattern.substring(start, end);
+ return escapeMetaCharsAndWildcards(likePattern, prefix, suffix);
+ }
+ /**
+ * Escapes the provided pattern by considering the following constraints:
+ * <ul>
+ * <li> SQL wildcards escaping is handled (_, %) </li>
+ * <li> Regex meta characters escaping is handled </li>
+ * </ul>
+ * @param input the provided input string
+ * @param prefix the prefix to be added to the output string
+ * @param suffix the suffix to be added to the output string
+ * @return the final output string
+ */
+ private static String escapeMetaCharsAndWildcards(String input, String prefix, String suffix) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(prefix);
+ // handling SQL wildcards (_, %) by replacing them with corresponding regex equivalents
+ // we ignore them if the SQL wildcards are escaped
int i = 0;
- while (i < sb.length()) {
- char c = sb.charAt(i);
- if (c == '_') {
- sb.replace(i, i + 1, ".");
- } else if (c == '%') {
- sb.replace(i, i + 1, ".*");
- i++;
+ int len = input.length();
+ boolean isPrevCharBackSlash = false;
+ while (i < len) {
+ char c = input.charAt(i);
+ switch (c) {
+ case '_':
+ sb.append(isPrevCharBackSlash ? c : ".");
+ break;
+ case '%':
+ sb.append(isPrevCharBackSlash ? c : ".*");
+ break;
+ default:
+ // either the current character is a meta-character
+ // OR
+ // this means the previous character is a \
+ // but it was not used for escaping SQL wildcards
+ // so let's escape this \ in the output
+ // this case is separately handled outside the meta characters list
+ if (Chars.indexOf(REGEXP_METACHARACTERS, c) >= 0 || isPrevCharBackSlash) {
+ sb.append(BACK_SLASH);
+ }
+ sb.append(c);
+ break;
}
+ isPrevCharBackSlash = (c == BACK_SLASH);
i++;
}
+ // handle trailing \
+ if (isPrevCharBackSlash) {
+ sb.append(BACK_SLASH);
+ }
+
+ sb.append(suffix);
return sb.toString();
}
@@ -103,18 +148,6 @@ public class RegexpPatternConverterUtils {
return -1;
}
- /**
- * Add escape characters before special characters
- */
- private static String escapeMetaCharacters(String pattern) {
- for (String metaCharacter : REGEXP_METACHARACTERS) {
- if (pattern.contains(metaCharacter)) {
- pattern = pattern.replace(metaCharacter, "\\" + metaCharacter);
- }
- }
- return pattern;
- }
-
/**
* Converts a REGEXP_LIKE pattern into Lucene REGEXP pattern.
*/
diff --git a/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java
index 8c94cb4886..0341ee7606 100644
--- a/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java
+++ b/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java
@@ -125,4 +125,41 @@ public class RegexpPatternConverterUtilsTest {
String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("z%");
assertEquals(regexpLikePattern, "^z");
}
+
+ @Test
+ public void testEscapedWildcard1() {
+ // the first underscore (_ in _b) is escaped, so it is meant to match an actual "_b" string in the provided
+ // string
+ // the second underscore (_ in b_) is not escaped, so it is a SQL wildcard that is used to match a single
+ // character, which in the regex space is "."
+ String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("a\\_b_\\");
+ assertEquals(regexpLikePattern, "^a\\_b.\\\\$");
+ String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern);
+ assertEquals(luceneRegExpPattern, "a\\_b.\\\\");
+ }
+
+ @Test
+ public void testEscapedWildcard2() {
+ // the % (% in %b) is escaped, so it is meant to match an actual "%b" string in the provided
+ // string
+ // the "\" before c is a normal "\", so it is meant to match an actual "\" string in the provided
+ // string, this is done because "c" is not a SQL wildcard - hence the "\" before that is used as-is
+ // and is not used for escaping "c"
+ // so, this "\" is escaped in the output as it is a regex metacharacter and the converted regex
+ // will match "a%b\cde" in the provided string
+ String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("a\\%b\\cde");
+ assertEquals(regexpLikePattern, "^a\\%b\\\\cde$");
+ String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern);
+ assertEquals(luceneRegExpPattern, "a\\%b\\\\cde");
+ }
+ @Test
+ public void testEscapedWildcard3() {
+ // here the "\" character is used to escape _, so _ here is not treated as a SQL wildcard
+ // but it is meant to actually match "_" in the provided string
+ // so the corresponding regex doesn't convert the "_" to "."
+ String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("%2\\_2%");
+ assertEquals(regexpLikePattern, "2\\_2");
+ String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern);
+ assertEquals(luceneRegExpPattern, ".*2\\_2.*");
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org