You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ya...@apache.org on 2020/12/08 11:58:26 UTC

[spark] branch branch-3.0 updated: [SPARK-33677][SQL] Skip LikeSimplification rule if pattern contains any escapeChar

This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new ea7c2a1  [SPARK-33677][SQL] Skip LikeSimplification rule if pattern contains any escapeChar
ea7c2a1 is described below

commit ea7c2a15a02d8c8cf3e3f5a1260da76829d59596
Author: luluorta <lu...@gmail.com>
AuthorDate: Tue Dec 8 20:45:25 2020 +0900

    [SPARK-33677][SQL] Skip LikeSimplification rule if pattern contains any escapeChar
    
    ### What changes were proposed in this pull request?
    `LikeSimplification` rule does not work correctly for many cases that have patterns containing escape characters, for example:
    
    `SELECT s LIKE 'm%aca' ESCAPE '%' FROM t`
    `SELECT s LIKE 'maacaa' ESCAPE 'a' FROM t`
    
    For simplicity, this PR simply skips this rule if `pattern` contains any `escapeChar`.
    
    ### Why are the changes needed?
    Results can be corrupted.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Added Unit test.
    
    Closes #30625 from luluorta/SPARK-33677.
    
    Authored-by: luluorta <lu...@gmail.com>
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
    (cherry picked from commit 99613cd5815b2de12274027dee0c0a6c0c57bd95)
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
 .../spark/sql/catalyst/optimizer/expressions.scala | 18 +++++---
 .../optimizer/LikeSimplificationSuite.scala        | 48 ++++++++++++++++++++++
 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 14 +++++++
 3 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 7773f5c..7cbeb47 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -525,27 +525,33 @@ object LikeSimplification extends Rule[LogicalPlan] {
   private val equalTo = "([^_%]*)".r
 
   def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
-    case Like(input, Literal(pattern, StringType), escapeChar) =>
+    case l @ Like(input, Literal(pattern, StringType), escapeChar) =>
       if (pattern == null) {
         // If pattern is null, return null value directly, since "col like null" == null.
         Literal(null, BooleanType)
       } else {
-        val escapeStr = String.valueOf(escapeChar)
         pattern.toString match {
-          case startsWith(prefix) if !prefix.endsWith(escapeStr) =>
+          // There are three different situations when the pattern contains escapeChar:
+          // 1. pattern contains invalid escape sequence, e.g. 'm\aca'
+          // 2. pattern contains escaped wildcard character, e.g. 'ma\%ca'
+          // 3. pattern contains escaped escape character, e.g. 'ma\\ca'
+          // Although there are patterns that can be optimized if we handle the escape first, we just
+          // skip this rule if the pattern contains any escapeChar for simplicity.
+          case p if p.contains(escapeChar) => l
+          case startsWith(prefix) =>
             StartsWith(input, Literal(prefix))
           case endsWith(postfix) =>
             EndsWith(input, Literal(postfix))
           // 'a%a' pattern is basically same with 'a%' && '%a'.
           // However, the additional `Length` condition is required to prevent 'a' match 'a%a'.
-          case startsAndEndsWith(prefix, postfix) if !prefix.endsWith(escapeStr) =>
+          case startsAndEndsWith(prefix, postfix) =>
             And(GreaterThanOrEqual(Length(input), Literal(prefix.length + postfix.length)),
               And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix))))
-          case contains(infix) if !infix.endsWith(escapeStr) =>
+          case contains(infix) =>
             Contains(input, Literal(infix))
           case equalTo(str) =>
             EqualTo(input, Literal(str))
-          case _ => Like(input, Literal.create(pattern, StringType), escapeChar)
+          case _ => l
         }
       }
   }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
index 436f62e..1812dce 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala
@@ -116,4 +116,52 @@ class LikeSimplificationSuite extends PlanTest {
     val optimized2 = Optimize.execute(originalQuery2.analyze)
     comparePlans(optimized2, originalQuery2.analyze)
   }
+
+  test("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar") {
+    val originalQuery1 =
+      testRelation
+        .where(('a like "abc%") || ('a like "\\abc%"))
+    val optimized1 = Optimize.execute(originalQuery1.analyze)
+    val correctAnswer1 = testRelation
+      .where(StartsWith('a, "abc") || ('a like "\\abc%"))
+      .analyze
+    comparePlans(optimized1, correctAnswer1)
+
+    val originalQuery2 =
+      testRelation
+        .where(('a like "%xyz") || ('a like "%xyz\\"))
+    val optimized2 = Optimize.execute(originalQuery2.analyze)
+    val correctAnswer2 = testRelation
+      .where(EndsWith('a, "xyz") || ('a like "%xyz\\"))
+      .analyze
+    comparePlans(optimized2, correctAnswer2)
+
+    val originalQuery3 =
+      testRelation
+        .where(('a like ("@bc%def", '@')) || ('a like "abc%def"))
+    val optimized3 = Optimize.execute(originalQuery3.analyze)
+    val correctAnswer3 = testRelation
+      .where(('a like ("@bc%def", '@')) ||
+        (Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))))
+      .analyze
+    comparePlans(optimized3, correctAnswer3)
+
+    val originalQuery4 =
+      testRelation
+        .where(('a like "%mn%") || ('a like ("%mn%", '%')))
+    val optimized4 = Optimize.execute(originalQuery4.analyze)
+    val correctAnswer4 = testRelation
+      .where(Contains('a, "mn") || ('a like ("%mn%", '%')))
+      .analyze
+    comparePlans(optimized4, correctAnswer4)
+
+    val originalQuery5 =
+      testRelation
+        .where(('a like "abc") || ('a like ("abbc", 'b')))
+    val optimized5 = Optimize.execute(originalQuery5.analyze)
+    val correctAnswer5 = testRelation
+      .where(('a === "abc") || ('a like ("abbc", 'b')))
+      .analyze
+    comparePlans(optimized5, correctAnswer5)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index bcd3e79..1e2bb48 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -3535,6 +3535,20 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
       }
     }
   }
+
+  test("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar") {
+    withTempView("df") {
+      Seq("m@ca").toDF("s").createOrReplaceTempView("df")
+
+      val e = intercept[AnalysisException] {
+        sql("SELECT s LIKE 'm%@ca' ESCAPE '%' FROM df").collect()
+      }
+      assert(e.message.contains("the pattern 'm%@ca' is invalid, " +
+        "the escape character is not allowed to precede '@'"))
+
+      checkAnswer(sql("SELECT s LIKE 'm@@ca' ESCAPE '@' FROM df"), Row(true))
+    }
+  }
 }
 
 case class Foo(bar: Option[String])


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org