Posted to commits@spark.apache.org by ya...@apache.org on 2021/01/08 01:44:58 UTC

[spark] branch branch-3.0 updated: [SPARK-33100][SQL][3.0] Ignore a semicolon inside a bracketed comment in spark-sql

This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new e7d5344  [SPARK-33100][SQL][3.0] Ignore a semicolon inside a bracketed comment in spark-sql
e7d5344 is described below

commit e7d53449f198bd8c5ee97d58f285994e31ea2d1a
Author: fwang12 <fw...@ebay.com>
AuthorDate: Fri Jan 8 10:44:12 2021 +0900

    [SPARK-33100][SQL][3.0] Ignore a semicolon inside a bracketed comment in spark-sql
    
    ### What changes were proposed in this pull request?
    Currently, spark-sql cannot correctly parse SQL statements that contain bracketed comments.
    For example, the SQL statements:
    ```
    /* SELECT 'test'; */
    SELECT 'test';
    ```
    would be split into two statements:
    The first one: `/* SELECT 'test'`
    The second one: `*/ SELECT 'test'`
    
    The CLI would then throw an exception because the first statement is illegal.
    In this PR, we ignore semicolons (and any other content) inside bracketed comments while splitting the SQL statements.
    In addition, statements that contain nothing but comments or whitespace are no longer emitted as separate statements.
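    
    For illustration only, a minimal standalone sketch of the splitting idea (it only tracks bracketed-comment depth; unlike the real `SparkSQLCLIDriver.splitSemiColon` in this patch, it skips quote, escape, and line-comment handling):
    ```scala
    // Hypothetical helper (not the patched code): a ';' only ends a statement
    // when the current /* ... */ nesting depth is zero.
    def splitIgnoringBracketedComments(line: String): Seq[String] = {
      val parts = scala.collection.mutable.ArrayBuffer.empty[String]
      var depth = 0   // current bracketed-comment nesting level
      var begin = 0   // start index of the statement being collected
      var i = 0
      while (i < line.length) {
        if (line.startsWith("/*", i)) {
          depth += 1; i += 2
        } else if (depth > 0 && line.startsWith("*/", i)) {
          depth -= 1; i += 2
        } else if (depth == 0 && line.charAt(i) == ';') {
          parts += line.substring(begin, i)  // split, drop the ';' itself
          begin = i + 1; i += 1
        } else {
          i += 1
        }
      }
      if (begin < line.length) parts += line.substring(begin)
      parts.toSeq
    }
    
    // splitIgnoringBracketedComments("/* SELECT 'test'; */ SELECT 'test';")
    // => Seq("/* SELECT 'test'; */ SELECT 'test'")   -- one statement, not two
    ```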
    
    NOTE: This backport comes from https://github.com/apache/spark/pull/29982
    
    ### Why are the changes needed?
    Without this change, spark-sql may split a statement at a semicolon inside a bracketed comment, which is incorrect.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Added UT.
    
    Closes #31033 from turboFei/SPARK-33100.
    
    Authored-by: fwang12 <fw...@ebay.com>
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
 .../sql/hive/thriftserver/SparkSQLCLIDriver.scala  | 50 ++++++++++++++++++----
 .../spark/sql/hive/thriftserver/CliSuite.scala     | 23 ++++++++++
 2 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
index 6abb905..581aa68 100644
--- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
+++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
@@ -518,15 +518,32 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
   // Note: [SPARK-31595] if there is a `'` in a double quoted string, or a `"` in a single quoted
   // string, the origin implementation from Hive will not drop the trailing semicolon as expected,
   // hence we refined this function a little bit.
+  // Note: [SPARK-33100] Ignore a semicolon inside a bracketed comment in spark-sql.
   private def splitSemiColon(line: String): JList[String] = {
     var insideSingleQuote = false
     var insideDoubleQuote = false
-    var insideComment = false
+    var insideSimpleComment = false
+    var bracketedCommentLevel = 0
     var escape = false
     var beginIndex = 0
+    var leavingBracketedComment = false
+    var isStatement = false
     val ret = new JArrayList[String]
 
+    def insideBracketedComment: Boolean = bracketedCommentLevel > 0
+    def insideComment: Boolean = insideSimpleComment || insideBracketedComment
+    def statementInProgress(index: Int): Boolean = isStatement || (!insideComment &&
+      index > beginIndex && !s"${line.charAt(index)}".trim.isEmpty)
+
     for (index <- 0 until line.length) {
+      // Checks if we need to decrement a bracketed comment level; the last character '/' of
+      // bracketed comments is still inside the comment, so `insideBracketedComment` must keep true
+      // in the previous loop and we decrement the level here if needed.
+      if (leavingBracketedComment) {
+        bracketedCommentLevel -= 1
+        leavingBracketedComment = false
+      }
+
       if (line.charAt(index) == '\'' && !insideComment) {
         // take a look to see if it is escaped
         // See the comment above about SPARK-31595
@@ -549,21 +566,34 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
           // Sample query: select "quoted value --"
           //                                    ^^ avoids starting a comment if it's inside quotes.
         } else if (hasNext && line.charAt(index + 1) == '-') {
-          // ignore quotes and ;
-          insideComment = true
+          // ignore quotes and ; in simple comment
+          insideSimpleComment = true
         }
       } else if (line.charAt(index) == ';') {
         if (insideSingleQuote || insideDoubleQuote || insideComment) {
           // do not split
         } else {
-          // split, do not include ; itself
-          ret.add(line.substring(beginIndex, index))
+          if (isStatement) {
+            // split, do not include ; itself
+            ret.add(line.substring(beginIndex, index))
+          }
           beginIndex = index + 1
+          isStatement = false
         }
       } else if (line.charAt(index) == '\n') {
-        // with a new line the inline comment should end.
+        // with a new line the inline simple comment should end.
         if (!escape) {
-          insideComment = false
+          insideSimpleComment = false
+        }
+      } else if (line.charAt(index) == '/' && !insideSimpleComment) {
+        val hasNext = index + 1 < line.length
+        if (insideSingleQuote || insideDoubleQuote) {
+          // Ignores '/' in any case of quotes
+        } else if (insideBracketedComment && line.charAt(index - 1) == '*' ) {
+          // Decrements `bracketedCommentLevel` at the beginning of the next loop
+          leavingBracketedComment = true
+        } else if (hasNext && !insideBracketedComment && line.charAt(index + 1) == '*') {
+          bracketedCommentLevel += 1
         }
       }
       // set the escape
@@ -572,8 +602,12 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging {
       } else if (line.charAt(index) == '\\') {
         escape = true
       }
+
+      isStatement = statementInProgress(index)
+    }
+    if (isStatement) {
+      ret.add(line.substring(beginIndex))
     }
-    ret.add(line.substring(beginIndex))
     ret
   }
 }
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index ea1a371..4a075d3 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -551,4 +551,27 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
       errorResponses = Seq("AnalysisException"))(
       ("", "Error in query: The second argument of 'date_sub' function needs to be an integer."))
   }
+
+  test("SPARK-33100: Ignore a semicolon inside a bracketed comment in spark-sql") {
+    runCliWithin(4.minute)(
+      "/* SELECT 'test';*/ SELECT 'test';" -> "test",
+      ";;/* SELECT 'test';*/ SELECT 'test';" -> "test",
+      "/* SELECT 'test';*/;; SELECT 'test';" -> "test",
+      "SELECT 'test'; -- SELECT 'test';" -> "test",
+      "SELECT 'test'; /* SELECT 'test';*/;" -> "test",
+      "/*$meta chars{^\\;}*/ SELECT 'test';" -> "test",
+      "/*\nmulti-line\n*/ SELECT 'test';" -> "test",
+      "/*/* multi-level bracketed*/ SELECT 'test';" -> "test"
+    )
+  }
+
+  test("SPARK-33100: test sql statements with hint in bracketed comment") {
+    runCliWithin(2.minute)(
+      "CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES(1, 2) AS t1(k, v);" -> "",
+      "CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES(2, 1) AS t2(k, v);" -> "",
+      "EXPLAIN SELECT /*+ MERGEJOIN(t1) */ t1.* FROM t1 JOIN t2 ON t1.k = t2.v;" -> "SortMergeJoin",
+      "EXPLAIN SELECT /* + MERGEJOIN(t1) */ t1.* FROM t1 JOIN t2 ON t1.k = t2.v;"
+        -> "BroadcastHashJoin"
+    )
+  }
 }
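
For context on the second CliSuite test above: `/*+ ... */` is parsed as a join hint, while `/* + ... */` (with a space after `/*`) is only a comment. A rough sketch of the same check from a Scala REPL with an active SparkSession named `spark` (this goes through `spark.sql`, not the CLI splitter; the view names are reused from the test only for illustration):
```scala
// Rough spark-shell sketch; assumes an active SparkSession named `spark`.
spark.sql("CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1, 2) AS t1(k, v)")
spark.sql("CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (2, 1) AS t2(k, v)")

// '/*+ MERGEJOIN(t1) */' is a hint, so the plan should show a SortMergeJoin.
spark.sql("SELECT /*+ MERGEJOIN(t1) */ t1.* FROM t1 JOIN t2 ON t1.k = t2.v").explain()

// '/* + MERGEJOIN(t1) */' is only a comment, so these tiny tables end up in a BroadcastHashJoin.
spark.sql("SELECT /* + MERGEJOIN(t1) */ t1.* FROM t1 JOIN t2 ON t1.k = t2.v").explain()
```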


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org