You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by at...@apache.org on 2023/06/22 18:55:34 UTC

[pinot] branch master updated: Fixes SQL wildcard escaping in LIKE queries (#10897)

This is an automated email from the ASF dual-hosted git repository.

atri pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 84aa659b71 Fixes SQL wildcard escaping in LIKE queries (#10897)
84aa659b71 is described below

commit 84aa659b713c3970da41a43925427708bc72a8c2
Author: Aravind Suresh <ar...@gmail.com>
AuthorDate: Fri Jun 23 00:25:29 2023 +0530

    Fixes SQL wildcard escaping in LIKE queries (#10897)
    
    * Fixes SQL wildcard escaping in LIKE queries
    
    * Updated unit tests
    
    * Fixed lint issues
    
    * Minor refactor
    
    * Addressed reviewer comments
    
    * Fixed lint
    
    * Addressed reviewer comments
    
    * Minor fixes
    
    * Minor refactor
---
 .../common/utils/RegexpPatternConverterUtils.java  | 87 +++++++++++++++-------
 .../utils/RegexpPatternConverterUtilsTest.java     | 37 +++++++++
 2 files changed, 97 insertions(+), 27 deletions(-)

diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java
index c62cf86f23..31a0d59d95 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/RegexpPatternConverterUtils.java
@@ -18,6 +18,9 @@
  */
 package org.apache.pinot.common.utils;
 
+
+import com.google.common.primitives.Chars;
+
 /**
  * Utility for converting regex patterns.
  */
@@ -25,9 +28,14 @@ public class RegexpPatternConverterUtils {
   private RegexpPatternConverterUtils() {
   }
 
-  /* Represents all metacharacters to be processed */
-  public static final String[] REGEXP_METACHARACTERS =
-      {"\\", "^", "$", ".", "{", "}", "[", "]", "(", ")", "*", "+", "?", "|", "<", ">", "-", "&", "/"};
+  /*
+   * Represents all metacharacters to be processed.
+   * This excludes the \ (back slash) character as that doubles up as an escape character as well.
+   * So it is handled separately in the conversion logic.
+   */
+  public static final char[] REGEXP_METACHARACTERS = new char[]{
+          '^', '$', '.', '{', '}', '[', ']', '(', ')', '*', '+', '?', '|', '<', '>', '-', '&', '/'};
+  public static final char BACK_SLASH = '\\';
 
   /**
    * Converts a LIKE pattern into REGEXP_LIKE pattern.
@@ -64,24 +72,61 @@ public class RegexpPatternConverterUtils {
         break;
     }
 
-    String escaped = escapeMetaCharacters(likePattern.substring(start, end));
-    StringBuilder sb = new StringBuilder(escaped.length() + 2);
-    sb.append(prefix);
-    sb.append(escaped);
-    sb.append(suffix);
+    likePattern = likePattern.substring(start, end);
+    return escapeMetaCharsAndWildcards(likePattern, prefix, suffix);
+  }
 
+  /**
+   * Escapes the provided pattern by considering the following constraints:
+   * <ul>
+   *     <li> SQL wildcards escaping is handled (_, %) </li>
+   *     <li> Regex meta characters escaping is handled </li>
+   * </ul>
+   * @param input the provided input string
+   * @param prefix the prefix to be added to the output string
+   * @param suffix the suffix to be added to the output string
+   * @return the final output string
+   */
+  private static String escapeMetaCharsAndWildcards(String input, String prefix, String suffix) {
+    StringBuilder sb = new StringBuilder();
+    sb.append(prefix);
+    // handling SQL wildcards (_, %) by replacing them with corresponding regex equivalents
+    // we ignore them if the SQL wildcards are escaped
     int i = 0;
-    while (i < sb.length()) {
-      char c = sb.charAt(i);
-      if (c == '_') {
-        sb.replace(i, i + 1, ".");
-      } else if (c == '%') {
-        sb.replace(i, i + 1, ".*");
-        i++;
+    int len = input.length();
+    boolean isPrevCharBackSlash = false;
+    while (i < len) {
+      char c = input.charAt(i);
+      switch (c) {
+        case '_':
+          sb.append(isPrevCharBackSlash ? c : ".");
+          break;
+        case '%':
+          sb.append(isPrevCharBackSlash ? c : ".*");
+          break;
+        default:
+          // either the current character is a meta-character
+          // OR
+          // this means the previous character is a \
+          // but it was not used for escaping SQL wildcards
+          // so let's escape this \ in the output
+          // this case is separately handled outside the meta characters list
+          if (Chars.indexOf(REGEXP_METACHARACTERS, c) >= 0 || isPrevCharBackSlash) {
+            sb.append(BACK_SLASH);
+          }
+          sb.append(c);
+          break;
       }
+      isPrevCharBackSlash = (c == BACK_SLASH);
       i++;
     }
 
+    // handle trailing \
+    if (isPrevCharBackSlash) {
+      sb.append(BACK_SLASH);
+    }
+
+    sb.append(suffix);
     return sb.toString();
   }
 
@@ -103,18 +148,6 @@ public class RegexpPatternConverterUtils {
     return -1;
   }
 
-  /**
-   * Add escape characters before special characters
-   */
-  private static String escapeMetaCharacters(String pattern) {
-    for (String metaCharacter : REGEXP_METACHARACTERS) {
-      if (pattern.contains(metaCharacter)) {
-        pattern = pattern.replace(metaCharacter, "\\" + metaCharacter);
-      }
-    }
-    return pattern;
-  }
-
   /**
    * Converts a REGEXP_LIKE pattern into Lucene REGEXP pattern.
    */
diff --git a/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java
index 8c94cb4886..0341ee7606 100644
--- a/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java
+++ b/pinot-common/src/test/java/org/apache/pinot/common/utils/RegexpPatternConverterUtilsTest.java
@@ -125,4 +125,41 @@ public class RegexpPatternConverterUtilsTest {
     String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("z%");
     assertEquals(regexpLikePattern, "^z");
   }
+
+  @Test
+  public void testEscapedWildcard1() {
+    // the first underscore (_ in _b) is escaped, so it is meant to match an actual "_b" string in the provided
+    // string
+    // the second underscore (_ in b_) is not escaped, so it is a SQL wildcard that is used to match a single
+    // character, which in the regex space is "."
+    String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("a\\_b_\\");
+    assertEquals(regexpLikePattern, "^a\\_b.\\\\$");
+    String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern);
+    assertEquals(luceneRegExpPattern, "a\\_b.\\\\");
+  }
+
+  @Test
+  public void testEscapedWildcard2() {
+    // the % (% in %b) is escaped, so it is meant to match an actual "%b" string in the provided
+    // string
+    // the "\" before c is a normal "\", so it is meant to match an actual "\" string in the provided
+    // string, this is done because "c" is not a SQL wildcard - hence the "\" before that is used as-is
+    // and is not used for escaping "c"
+    // so, this "\" is escaped in the output as it is a regex metacharacter and the converted regex
+    // will match "a%b\cde" in the provided string
+    String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("a\\%b\\cde");
+    assertEquals(regexpLikePattern, "^a\\%b\\\\cde$");
+    String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern);
+    assertEquals(luceneRegExpPattern, "a\\%b\\\\cde");
+  }
+  @Test
+  public void testEscapedWildcard3() {
+    // here the "\" character is used to escape _, so _ here is not treated as a SQL wildcard
+    // but it is meant to actually match "_" in the provided string
+    // so the corresponding regex doesn't convert the "_" to "."
+    String regexpLikePattern = RegexpPatternConverterUtils.likeToRegexpLike("%2\\_2%");
+    assertEquals(regexpLikePattern, "2\\_2");
+    String luceneRegExpPattern = RegexpPatternConverterUtils.regexpLikeToLuceneRegExp(regexpLikePattern);
+    assertEquals(luceneRegExpPattern, ".*2\\_2.*");
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org