You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by th...@apache.org on 2022/05/19 17:32:11 UTC

[solr] branch main updated: SOLR-16199: Improve handling of LIKE queries with wildcard (#870)

This is an automated email from the ASF dual-hosted git repository.

thelabdude pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new b28e7514deb SOLR-16199: Improve handling of LIKE queries with wildcard (#870)
b28e7514deb is described below

commit b28e7514deb92c55b24aa102b65cd9992a47beff
Author: Kiran Chitturi <ki...@lucidworks.com>
AuthorDate: Thu May 19 10:32:04 2022 -0700

    SOLR-16199: Improve handling of LIKE queries with wildcard (#870)
    
    * SOLR-16199 Improve handling of LIKE queries with wildcard
    
    * rewrite escape logic
    
    * Fix spotless issue
    
    Co-authored-by: kiran chitturi <kc...@apple.com>
---
 solr/CHANGES.txt                                   |   3 +
 .../org/apache/solr/handler/sql/SolrFilter.java    | 107 ++++++++++++++++----
 .../apache/solr/handler/sql/TestSQLHandler.java    | 109 +++++++++++++++++++--
 3 files changed, 191 insertions(+), 28 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 4ffa7b7087b..253a90152d0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -816,6 +816,9 @@ Bug Fixes
 
 * SOLR-16022: Enforce special character requirements on passwords with length less than 15 (Timothy Potter)
 
+* SOLR-16199: Improve query syntax construction for SQL LIKE clause with phrases and wildcards
+  (Kiran Chitturi, Aroop Ganguly, Amrit Sarkar via Timothy Potter)
+
 ==================  8.11.1 ==================
 
 Bug Fixes
diff --git a/solr/modules/sql/src/java/org/apache/solr/handler/sql/SolrFilter.java b/solr/modules/sql/src/java/org/apache/solr/handler/sql/SolrFilter.java
index 216ed4913a3..8c913755c38 100644
--- a/solr/modules/sql/src/java/org/apache/solr/handler/sql/SolrFilter.java
+++ b/solr/modules/sql/src/java/org/apache/solr/handler/sql/SolrFilter.java
@@ -346,21 +346,54 @@ class SolrFilter extends Filter implements SolrRel {
     }
 
     protected String translateLike(RexNode like) {
-      Pair<String, RexLiteral> pair = getFieldValuePair(like);
+      Pair<Pair<String, RexLiteral>, Character> pairWithEscapeCharacter =
+          getFieldValuePairWithEscapeCharacter(like);
+      Pair<String, RexLiteral> pair = pairWithEscapeCharacter.getKey();
+      Character escapeChar = pairWithEscapeCharacter.getValue();
+
       String terms = pair.getValue().toString().trim();
-      terms = terms.replace("'", "").replace('%', '*').replace('_', '?');
-      boolean wrappedQuotes = false;
+      terms = translateLikeTermToSolrSyntax(terms, escapeChar);
+
       if (!terms.startsWith("(") && !terms.startsWith("[") && !terms.startsWith("{")) {
-        // restore the * and ? after escaping
-        terms =
-            "\""
-                + ClientUtils.escapeQueryChars(terms).replace("\\*", "*").replace("\\?", "?")
-                + "\"";
-        wrappedQuotes = true;
-      }
+        terms = escapeWithWildcard(terms);
+
+        // if terms contains multiple words and one or more wildcard chars, then we need to employ
+        // the complexphrase parser
+        // but that expects the terms wrapped in double-quotes, not parens
+        boolean hasMultipleTerms = terms.split("\\s+").length > 1;
+        if (hasMultipleTerms && (terms.contains("*") || terms.contains("?"))) {
+          String quotedTerms = "\"" + terms.substring(1, terms.length() - 1) + "\"";
+          return "{!complexphrase}" + pair.getKey() + ":" + quotedTerms;
+        }
+      } // else treat as an embedded Solr query and pass-through
+
+      return pair.getKey() + ":" + terms;
+    }
 
-      String query = pair.getKey() + ":" + terms;
-      return wrappedQuotes ? "{!complexphrase}" + query : query;
+    private String translateLikeTermToSolrSyntax(String term, Character escapeChar) {
+      boolean isEscaped = false;
+      StringBuilder sb = new StringBuilder();
+      // Special character % and _ are escaped with escape character and single quote is escaped
+      // with another single quote
+      // If single quote is escaped with escape character, calcite parser fails
+      for (int i = 0; i < term.length(); i++) {
+        char c = term.charAt(i);
+        if (!isEscaped && escapeChar != null && escapeChar == c) {
+          isEscaped = true;
+        } else if (c == '%' && !isEscaped) {
+          sb.append('*');
+        } else if (c == '_' && !isEscaped) {
+          sb.append("?");
+        } else if (c == '\'') {
+          if (i > 0 && term.charAt(i - 1) == '\'') {
+            sb.append(c);
+          }
+        } else {
+          sb.append(c);
+          if (isEscaped) isEscaped = false;
+        }
+      }
+      return sb.toString();
     }
 
     protected String translateComparison(RexNode node) {
@@ -414,18 +447,30 @@ class SolrFilter extends Filter implements SolrRel {
 
       String terms = toSolrLiteral(key, value).trim();
 
-      boolean wrappedQuotes = false;
       if (!terms.startsWith("(") && !terms.startsWith("[") && !terms.startsWith("{")) {
-        terms = "\"" + ClientUtils.escapeQueryChars(terms) + "\"";
-        wrappedQuotes = true;
+        if (terms.contains("*") || terms.contains("?")) {
+          terms = escapeWithWildcard(terms);
+        } else {
+          terms = "\"" + ClientUtils.escapeQueryChars(terms) + "\"";
+        }
       }
 
-      String clause = key + ":" + terms;
-      if (terms.contains("*") && wrappedQuotes) {
-        clause = "{!complexphrase}" + clause;
-      }
+      return key + ":" + terms;
+    }
 
-      return clause;
+    // Wrap filter criteria containing wildcard with parens and unescape the wildcards after
+    // escaping protected query chars
+    private String escapeWithWildcard(String terms) {
+      String escaped =
+          ClientUtils.escapeQueryChars(terms)
+              .replace("\\*", "*")
+              .replace("\\?", "?")
+              .replace("\\ ", " ");
+      // if multiple terms, then wrap with parens
+      if (escaped.split("\\s+").length > 1) {
+        escaped = "(" + escaped + ")";
+      }
+      return escaped;
     }
 
     // translate to a literal string value for Solr queries, such as translating a
@@ -480,6 +525,28 @@ class SolrFilter extends Filter implements SolrRel {
       return timestamp;
     }
 
+    protected Pair<Pair<String, RexLiteral>, Character> getFieldValuePairWithEscapeCharacter(
+        RexNode node) {
+      if (!(node instanceof RexCall)) {
+        throw new AssertionError("expected RexCall for predicate but found: " + node);
+      }
+      RexCall call = (RexCall) node;
+      if (call.getOperands().size() == 3) {
+        RexNode escapeNode = call.getOperands().get(2);
+        Character escapeChar = null;
+        if (escapeNode.getKind() == SqlKind.LITERAL) {
+          RexLiteral literal = (RexLiteral) escapeNode;
+          if (literal.getTypeName() == SqlTypeName.CHAR) {
+            escapeChar = literal.getValueAs(Character.class);
+          }
+        }
+        return Pair.of(
+            translateBinary2(call.getOperands().get(0), call.getOperands().get(1)), escapeChar);
+      } else {
+        return Pair.of(getFieldValuePair(node), null);
+      }
+    }
+
     protected Pair<String, RexLiteral> getFieldValuePair(RexNode node) {
       if (!(node instanceof RexCall)) {
         throw new AssertionError("expected RexCall for predicate but found: " + node);
diff --git a/solr/modules/sql/src/test/org/apache/solr/handler/sql/TestSQLHandler.java b/solr/modules/sql/src/test/org/apache/solr/handler/sql/TestSQLHandler.java
index fde59889b31..eefc20f0309 100644
--- a/solr/modules/sql/src/test/org/apache/solr/handler/sql/TestSQLHandler.java
+++ b/solr/modules/sql/src/test/org/apache/solr/handler/sql/TestSQLHandler.java
@@ -2434,18 +2434,95 @@ public class TestSQLHandler extends SolrCloudTestCase {
   @Test
   public void testLike() throws Exception {
     new UpdateRequest()
-        .add("id", "1", "a_s", "hello-1", "b_s", "foo")
-        .add("id", "2", "a_s", "world-2", "b_s", "foo")
-        .add("id", "3", "a_s", "hello-3", "b_s", "foo")
-        .add("id", "4", "a_s", "world-4", "b_s", "foo")
-        .add("id", "5", "a_s", "hello-5", "b_s", "foo")
-        .add("id", "6", "a_s", "world-6", "b_s", "bar")
+        .add(
+            "id",
+            "1",
+            "a_s",
+            "hello-1",
+            "b_s",
+            "foo",
+            "c_t",
+            "the quick brown fox jumped over the lazy dog")
+        .add(
+            "id",
+            "2",
+            "a_s",
+            "world-2",
+            "b_s",
+            "foo",
+            "c_t",
+            "the sly black dog jumped over the sleeping pig")
+        .add(
+            "id",
+            "3",
+            "a_s",
+            "hello-3",
+            "b_s",
+            "foo",
+            "c_t",
+            "the quick brown fox jumped over the lazy dog")
+        .add(
+            "id",
+            "4",
+            "a_s",
+            "world-4",
+            "b_s",
+            "foo",
+            "c_t",
+            "the sly black dog jumped over the sleepy pig")
+        .add(
+            "id",
+            "5",
+            "a_s",
+            "hello-5",
+            "b_s",
+            "foo",
+            "c_t",
+            "the quick brown fox jumped over the lazy dog")
+        .add(
+            "id",
+            "6",
+            "a_s",
+            "w_orld-6",
+            "b_s",
+            "bar",
+            "c_t",
+            "the sly black dog jumped over the sleepin piglet")
+        .add(
+            "id",
+            "7",
+            "a_s",
+            "world%_7",
+            "b_s",
+            "zaz",
+            "c_t",
+            "the lazy dog jumped over the quick brown fox")
+        .add(
+            "id",
+            "8",
+            "a_s",
+            "world'\\9",
+            "b_s",
+            "zaz",
+            "c_t",
+            "the lazy dog ducked over the quick brown fox")
         .commit(cluster.getSolrClient(), COLLECTIONORALIAS);
 
     expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE 'h_llo-%'", 3);
 
+    Throwable exception =
+        expectThrows(
+            IOException.class,
+            () ->
+                expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE 'world\\'%' ESCAPE '\\'", 1));
+    assertTrue(exception.getMessage().contains("parse failed: Lexical error"));
+    expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE 'world''%'", 1);
+    expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE 'world''\\\\%' ESCAPE '\\'", 1);
+    expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE 'w\\_o_ld%' ESCAPE '\\'", 1);
+    expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE 'world\\%\\__' ESCAPE '\\'", 1);
+
     // not technically valid SQL but we support it for legacy purposes, see: SOLR-15463
-    expectResults("SELECT a_s FROM $ALIAS WHERE a_s='world-*'", 3);
+    expectResults("SELECT a_s FROM $ALIAS WHERE a_s='world-*'", 2);
 
     // no results
     expectResults("SELECT a_s FROM $ALIAS WHERE a_s LIKE '%MATCHNONE%'", 0);
@@ -2454,7 +2531,7 @@ public class TestSQLHandler extends SolrCloudTestCase {
     expectResults("SELECT b_s FROM $ALIAS WHERE b_s LIKE 'foo'", 5);
 
     // NOT LIKE
-    expectResults("SELECT b_s FROM $ALIAS WHERE b_s NOT LIKE 'f%'", 1);
+    expectResults("SELECT b_s FROM $ALIAS WHERE b_s NOT LIKE 'f%'", 3);
 
     // leading wildcard
     expectResults("SELECT b_s FROM $ALIAS WHERE b_s LIKE '%oo'", 5);
@@ -2463,6 +2540,22 @@ public class TestSQLHandler extends SolrCloudTestCase {
     expectResults("SELECT b_s FROM $ALIAS WHERE b_s LIKE '(fo%)'", 5);
 
     expectResults("SELECT b_s FROM $ALIAS WHERE b_s LIKE '(ba*)'", 1);
+
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'fox'", 5);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'sleep% pig%'", 3);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'sleep% pigle%'", 1);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'sleep% piglet'", 1);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'sleep% pigle%'", 1);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'sleep% piglet'", 1);
+
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'jump%'", 7);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE '%ump%'", 7);
+
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE '(\"dog pig\"~5)'", 2);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'jumped over'", 8);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE 'quick brown fox'", 5);
+    expectResults("SELECT b_s FROM $ALIAS WHERE b_s LIKE 'foo*'", 5);
+    expectResults("SELECT b_s FROM $ALIAS WHERE c_t LIKE '*og'", 8);
   }
 
   @Test