Posted to commits@spark.apache.org by ya...@apache.org on 2020/06/15 15:47:43 UTC

[spark] branch branch-3.0 updated (764da2f -> ed69190)

This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a change to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git.


    from 764da2f  [SPARK-31990][SS] Use toSet.toSeq in Dataset.dropDuplicates
     new b70c68a  [SPARK-31950][SQL][TESTS] Extract SQL keywords from the SqlBase.g4 file
     new ed69190  [SPARK-26905][SQL] Follow the SQL:2016 reserved keywords

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/sql-ref-ansi-compliance.md                    |   6 +-
 .../apache/spark/sql/catalyst/parser/SqlBase.g4    |   7 +
 .../resources/ansi-sql-2016-reserved-keywords.txt  | 401 ++++++++++++++++++
 .../parser/TableIdentifierParserSuite.scala        | 452 ++++++---------------
 4 files changed, 537 insertions(+), 329 deletions(-)
 create mode 100644 sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org


[spark] 02/02: [SPARK-26905][SQL] Follow the SQL:2016 reserved keywords

Posted by ya...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git

commit ed69190ce0762f3b741b8d175ef8d02da45f3183
Author: Takeshi Yamamuro <ya...@apache.org>
AuthorDate: Tue Jun 16 00:27:45 2020 +0900

    [SPARK-26905][SQL] Follow the SQL:2016 reserved keywords
    
    ### What changes were proposed in this pull request?
    
    This PR intends to move keywords `ANTI`, `SEMI`, and `MINUS` from reserved to non-reserved.
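
    A minimal sketch of what this enables (assuming a running `SparkSession` named `spark`; the table and column names below are hypothetical and only for illustration):

    ```scala
    // With ANSI mode enabled, ANTI, SEMI, and MINUS are no longer reserved
    // keywords, so they can appear as ordinary identifiers.
    spark.conf.set("spark.sql.ansi.enabled", "true")
    spark.sql("CREATE TABLE minus (anti INT, semi INT) USING parquet")
    spark.sql("SELECT anti, semi FROM minus").show()
    ```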
    
    ### Why are the changes needed?
    
    To comply with the ANSI/SQL standard.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Added tests.
    
    Closes #28807 from maropu/SPARK-26905-2.
    
    Authored-by: Takeshi Yamamuro <ya...@apache.org>
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
 docs/sql-ref-ansi-compliance.md                    |   6 +-
 .../apache/spark/sql/catalyst/parser/SqlBase.g4    |   3 +
 .../resources/ansi-sql-2016-reserved-keywords.txt  | 401 +++++++++++++++++++++
 .../parser/TableIdentifierParserSuite.scala        |  24 +-
 4 files changed, 429 insertions(+), 5 deletions(-)

diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index eab194c..e5ca7e9d 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -135,7 +135,7 @@ Below is a list of all the keywords in Spark SQL.
 |ALTER|non-reserved|non-reserved|reserved|
 |ANALYZE|non-reserved|non-reserved|non-reserved|
 |AND|reserved|non-reserved|reserved|
-|ANTI|reserved|strict-non-reserved|non-reserved|
+|ANTI|non-reserved|strict-non-reserved|non-reserved|
 |ANY|reserved|non-reserved|reserved|
 |ARCHIVE|non-reserved|non-reserved|non-reserved|
 |ARRAY|non-reserved|non-reserved|reserved|
@@ -264,7 +264,7 @@ Below is a list of all the keywords in Spark SQL.
 |MAP|non-reserved|non-reserved|non-reserved|
 |MATCHED|non-reserved|non-reserved|non-reserved|
 |MERGE|non-reserved|non-reserved|non-reserved|
-|MINUS|reserved|strict-non-reserved|non-reserved|
+|MINUS|non-reserved|strict-non-reserved|non-reserved|
 |MINUTE|reserved|non-reserved|reserved|
 |MONTH|reserved|non-reserved|reserved|
 |MSCK|non-reserved|non-reserved|non-reserved|
@@ -325,7 +325,7 @@ Below is a list of all the keywords in Spark SQL.
 |SCHEMA|non-reserved|non-reserved|non-reserved|
 |SECOND|reserved|non-reserved|reserved|
 |SELECT|reserved|non-reserved|reserved|
-|SEMI|reserved|strict-non-reserved|non-reserved|
+|SEMI|non-reserved|strict-non-reserved|non-reserved|
 |SEPARATED|non-reserved|non-reserved|non-reserved|
 |SERDE|non-reserved|non-reserved|non-reserved|
 |SERDEPROPERTIES|non-reserved|non-reserved|non-reserved|
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 14a6687..5821a74 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -994,6 +994,7 @@ ansiNonReserved
     | AFTER
     | ALTER
     | ANALYZE
+    | ANTI
     | ARCHIVE
     | ARRAY
     | ASC
@@ -1126,10 +1127,12 @@ ansiNonReserved
     | ROW
     | ROWS
     | SCHEMA
+    | SEMI
     | SEPARATED
     | SERDE
     | SERDEPROPERTIES
     | SET
+    | SETMINUS
     | SETS
     | SHOW
     | SKEWED
diff --git a/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt b/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt
new file mode 100644
index 0000000..921491a
--- /dev/null
+++ b/sql/catalyst/src/test/resources/ansi-sql-2016-reserved-keywords.txt
@@ -0,0 +1,401 @@
+-- This file comes from: https://github.com/postgres/postgres/tree/master/doc/src/sgml/keywords
+ABS
+ACOS
+ALL
+ALLOCATE
+ALTER
+AND
+ANY
+ARE
+ARRAY
+ARRAY_AGG
+ARRAY_MAX_CARDINALITY
+AS
+ASENSITIVE
+ASIN
+ASYMMETRIC
+AT
+ATAN
+ATOMIC
+AUTHORIZATION
+AVG
+BEGIN
+BEGIN_FRAME
+BEGIN_PARTITION
+BETWEEN
+BIGINT
+BINARY
+BLOB
+BOOLEAN
+BOTH
+BY
+CALL
+CALLED
+CARDINALITY
+CASCADED
+CASE
+CAST
+CEIL
+CEILING
+CHAR
+CHAR_LENGTH
+CHARACTER
+CHARACTER_LENGTH
+CHECK
+CLASSIFIER
+CLOB
+CLOSE
+COALESCE
+COLLATE
+COLLECT
+COLUMN
+COMMIT
+CONDITION
+CONNECT
+CONSTRAINT
+CONTAINS
+CONVERT
+COPY
+CORR
+CORRESPONDING
+COS
+COSH
+COUNT
+COVAR_POP
+COVAR_SAMP
+CREATE
+CROSS
+CUBE
+CUME_DIST
+CURRENT
+CURRENT_CATALOG
+CURRENT_DATE
+CURRENT_DEFAULT_TRANSFORM_GROUP
+CURRENT_PATH
+CURRENT_ROLE
+CURRENT_ROW
+CURRENT_SCHEMA
+CURRENT_TIME
+CURRENT_TIMESTAMP
+CURRENT_TRANSFORM_GROUP_FOR_TYPE
+CURRENT_USER
+CURSOR
+CYCLE
+DATE
+DAY
+DEALLOCATE
+DEC
+DECFLOAT
+DECIMAL
+DECLARE
+DEFAULT
+DEFINE
+DELETE
+DENSE_RANK
+DEREF
+DESCRIBE
+DETERMINISTIC
+DISCONNECT
+DISTINCT
+DOUBLE
+DROP
+DYNAMIC
+EACH
+ELEMENT
+ELSE
+EMPTY
+END
+END_FRAME
+END_PARTITION
+END-EXEC
+EQUALS
+ESCAPE
+EVERY
+EXCEPT
+EXEC
+EXECUTE
+EXISTS
+EXP
+EXTERNAL
+EXTRACT
+FALSE
+FETCH
+FILTER
+FIRST_VALUE
+FLOAT
+FLOOR
+FOR
+FOREIGN
+FRAME_ROW
+FREE
+FROM
+FULL
+FUNCTION
+FUSION
+GET
+GLOBAL
+GRANT
+GROUP
+GROUPING
+GROUPS
+HAVING
+HOLD
+HOUR
+IDENTITY
+IN
+INDICATOR
+INITIAL
+INNER
+INOUT
+INSENSITIVE
+INSERT
+INT
+INTEGER
+INTERSECT
+INTERSECTION
+INTERVAL
+INTO
+IS
+JOIN
+JSON_ARRAY
+JSON_ARRAYAGG
+JSON_EXISTS
+JSON_OBJECT
+JSON_OBJECTAGG
+JSON_QUERY
+JSON_TABLE
+JSON_TABLE_PRIMITIVE
+JSON_VALUE
+LAG
+LANGUAGE
+LARGE
+LAST_VALUE
+LATERAL
+LEAD
+LEADING
+LEFT
+LIKE
+LIKE_REGEX
+LISTAGG
+LN
+LOCAL
+LOCALTIME
+LOCALTIMESTAMP
+LOG
+LOG10
+LOWER
+MATCH
+MATCH_NUMBER
+MATCH_RECOGNIZE
+MATCHES
+MAX
+MEASURES
+MEMBER
+MERGE
+METHOD
+MIN
+MINUTE
+MOD
+MODIFIES
+MODULE
+MONTH
+MULTISET
+NATIONAL
+NATURAL
+NCHAR
+NCLOB
+NEW
+NO
+NONE
+NORMALIZE
+NOT
+NTH_VALUE
+NTILE
+NULL
+NULLIF
+NUMERIC
+OCCURRENCES_REGEX
+OCTET_LENGTH
+OF
+OFFSET
+OLD
+OMIT
+ON
+ONE
+ONLY
+OPEN
+OR
+ORDER
+OUT
+OUTER
+OVER
+OVERLAPS
+OVERLAY
+PARAMETER
+PARTITION
+PATTERN
+PER
+PERCENT
+PERCENT_RANK
+PERCENTILE_CONT
+PERCENTILE_DISC
+PERIOD
+PERMUTE
+PORTION
+POSITION
+POSITION_REGEX
+POWER
+PRECEDES
+PRECISION
+PREPARE
+PRIMARY
+PROCEDURE
+PTF
+RANGE
+RANK
+READS
+REAL
+RECURSIVE
+REF
+REFERENCES
+REFERENCING
+REGR_AVGX
+REGR_AVGY
+REGR_COUNT
+REGR_INTERCEPT
+REGR_R2
+REGR_SLOPE
+REGR_SXX
+REGR_SXY
+REGR_SYY
+RELEASE
+RESULT
+RETURN
+RETURNS
+REVOKE
+RIGHT
+ROLLBACK
+ROLLUP
+ROW
+ROW_NUMBER
+ROWS
+RUNNING
+SAVEPOINT
+SCOPE
+SCROLL
+SEARCH
+SECOND
+SEEK
+SELECT
+SENSITIVE
+SESSION_USER
+SET
+SHOW
+SIMILAR
+SIN
+SINH
+SKIP
+SMALLINT
+SOME
+SPECIFIC
+SPECIFICTYPE
+SQL
+SQLEXCEPTION
+SQLSTATE
+SQLWARNING
+SQRT
+START
+STATIC
+STDDEV_POP
+STDDEV_SAMP
+SUBMULTISET
+SUBSET
+SUBSTRING
+SUBSTRING_REGEX
+SUCCEEDS
+SUM
+SYMMETRIC
+SYSTEM
+SYSTEM_TIME
+SYSTEM_USER
+TABLE
+TABLESAMPLE
+TAN
+TANH
+THEN
+TIME
+TIMESTAMP
+TIMEZONE_HOUR
+TIMEZONE_MINUTE
+TO
+TRAILING
+TRANSLATE
+TRANSLATE_REGEX
+TRANSLATION
+TREAT
+TRIGGER
+TRIM
+TRIM_ARRAY
+TRUE
+TRUNCATE
+UESCAPE
+UNION
+UNIQUE
+UNKNOWN
+UNMATCHED
+UNNEST
+UPDATE
+UPPER
+USER
+USING
+VALUE
+VALUES
+VALUE_OF
+VAR_POP
+VAR_SAMP
+VARBINARY
+VARCHAR
+VARYING
+VERSIONING
+WHEN
+WHENEVER
+WHERE
+WIDTH_BUCKET
+WINDOW
+WITH
+WITHIN
+WITHOUT
+YEAR
+DATALINK
+DLNEWCOPY
+DLPREVIOUSCOPY
+DLURLCOMPLETE
+DLURLCOMPLETEWRITE
+DLURLCOMPLETEONLY
+DLURLPATH
+DLURLPATHWRITE
+DLURLPATHONLY
+DLURLSCHEME
+DLURLSERVER
+DLVALUE
+IMPORT
+XML
+XMLAGG
+XMLATTRIBUTES
+XMLBINARY
+XMLCAST
+XMLCOMMENT
+XMLCONCAT
+XMLDOCUMENT
+XMLELEMENT
+XMLEXISTS
+XMLFOREST
+XMLITERATE
+XMLNAMESPACES
+XMLPARSE
+XMLPI
+XMLQUERY
+XMLSERIALIZE
+XMLTABLE
+XMLTEXT
+XMLVALIDATE
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
index 04969e3..04c427d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
@@ -16,8 +16,11 @@
  */
 package org.apache.spark.sql.catalyst.parser
 
+import java.io.File
+import java.nio.file.Files
 import java.util.Locale
 
+import scala.collection.JavaConverters._
 import scala.collection.mutable
 
 import org.apache.spark.SparkFunSuite
@@ -340,7 +343,12 @@ class TableIdentifierParserSuite extends SparkFunSuite with SQLHelper {
         // The case where a symbol has multiple literal definitions,
         // e.g., `DATABASES: 'DATABASES' | 'SCHEMAS';`.
         if (hasMultipleLiterals) {
-          val literals = splitDefs.map(_.replaceAll("'", "").trim).toSeq
+          // Filters out inappropriate entries, e.g., `!` in `NOT: 'NOT' | '!';`
+          val litDef = """([A-Z_]+)""".r
+          val literals = splitDefs.map(_.replaceAll("'", "").trim).toSeq.flatMap {
+            case litDef(lit) => Some(lit)
+            case _ => None
+          }
           (symbol, literals) :: Nil
         } else {
           val literal = literalDef.replaceAll("'", "").trim
@@ -388,12 +396,24 @@ class TableIdentifierParserSuite extends SparkFunSuite with SQLHelper {
   val reservedKeywordsInAnsiMode = allCandidateKeywords -- nonReservedKeywordsInAnsiMode
 
   test("check # of reserved keywords") {
-    val numReservedKeywords = 78
+    val numReservedKeywords = 74
     assert(reservedKeywordsInAnsiMode.size == numReservedKeywords,
       s"The expected number of reserved keywords is $numReservedKeywords, but " +
         s"${reservedKeywordsInAnsiMode.size} found.")
   }
 
+  test("reserved keywords in Spark are also reserved in SQL 2016") {
+    withTempDir { dir =>
+      val tmpFile = new File(dir, "tmp")
+      val is = Thread.currentThread().getContextClassLoader
+        .getResourceAsStream("ansi-sql-2016-reserved-keywords.txt")
+      Files.copy(is, tmpFile.toPath)
+      val reservedKeywordsInSql2016 = Files.readAllLines(tmpFile.toPath)
+        .asScala.filterNot(_.startsWith("--")).map(_.trim).toSet
+      assert((reservedKeywordsInAnsiMode -- reservedKeywordsInSql2016).isEmpty)
+    }
+  }
+
   test("table identifier") {
     // Regular names.
     assert(TableIdentifier("q") === parseTableIdentifier("q"))




[spark] 01/02: [SPARK-31950][SQL][TESTS] Extract SQL keywords from the SqlBase.g4 file

Posted by ya...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git

commit b70c68ae458d929cbf28a084cecf8252b4a3849f
Author: Takeshi Yamamuro <ya...@apache.org>
AuthorDate: Sat Jun 13 07:12:27 2020 +0900

    [SPARK-31950][SQL][TESTS] Extract SQL keywords from the SqlBase.g4 file
    
    ### What changes were proposed in this pull request?
    
    This PR intends to extract SQL reserved/non-reserved keywords from the ANTLR grammar file (`SqlBase.g4`) directly.
    
    This approach is based on cloud-fan's suggestion: https://github.com/apache/spark/pull/28779#issuecomment-642033217
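
    A minimal, self-contained sketch of the extraction idea (not the suite's exact code; `extractKeywords` is a hypothetical helper, and the marker names match the tags this PR adds to `SqlBase.g4`):

    ```scala
    // Collect keyword symbols between the grammar's marker comments,
    // matching definitions such as `AFTER: 'AFTER';`.
    def extractKeywords(grammarLines: Seq[String]): Set[String] = {
      val kwDef = """([A-Z_]+):.+;""".r
      grammarLines
        .dropWhile(line => !line.trim.startsWith("//--SPARK-KEYWORD-LIST-START"))
        .takeWhile(line => !line.trim.startsWith("//--SPARK-KEYWORD-LIST-END"))
        .collect { case kwDef(symbol) => symbol }
        .toSet
    }
    ```

    The actual suite additionally expands symbols whose literal differs from the symbol name (e.g., `SETMINUS: 'MINUS';`), as shown in the diff below.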
    
    ### Why are the changes needed?
    
    It is hard to maintain a full set of the keywords in `TableIdentifierParserSuite`, so it would be nice if we could extract them from the `SqlBase.g4` file directly.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests.
    
    Closes #28802 from maropu/SPARK-31950-2.
    
    Authored-by: Takeshi Yamamuro <ya...@apache.org>
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
 .../apache/spark/sql/catalyst/parser/SqlBase.g4    |   4 +
 .../parser/TableIdentifierParserSuite.scala        | 432 +++++----------------
 2 files changed, 110 insertions(+), 326 deletions(-)

diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 208a503..14a6687 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -989,6 +989,7 @@ alterColumnAction
 // You can find the full keywords list by searching "Start of the keywords list" in this file.
 // The non-reserved keywords are listed below. Keywords not in this list are reserved keywords.
 ansiNonReserved
+//--ANSI-NON-RESERVED-START
     : ADD
     | AFTER
     | ALTER
@@ -1165,6 +1166,7 @@ ansiNonReserved
     | VIEW
     | VIEWS
     | WINDOW
+//--ANSI-NON-RESERVED-END
     ;
 
 // When `SQL_standard_keyword_behavior=false`, there are 2 kinds of keywords in Spark SQL.
@@ -1442,6 +1444,7 @@ nonReserved
 //============================
 // Start of the keywords list
 //============================
+//--SPARK-KEYWORD-LIST-START
 ADD: 'ADD';
 AFTER: 'AFTER';
 ALL: 'ALL';
@@ -1694,6 +1697,7 @@ WHERE: 'WHERE';
 WINDOW: 'WINDOW';
 WITH: 'WITH';
 YEAR: 'YEAR';
+//--SPARK-KEYWORD-LIST-END
 //============================
 // End of the keywords list
 //============================
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
index bd617bf..04969e3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
@@ -16,9 +16,14 @@
  */
 package org.apache.spark.sql.catalyst.parser
 
+import java.util.Locale
+
+import scala.collection.mutable
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.catalyst.util.fileToString
 import org.apache.spark.sql.internal.SQLConf
 
 class TableIdentifierParserSuite extends SparkFunSuite with SQLHelper {
@@ -285,334 +290,109 @@ class TableIdentifierParserSuite extends SparkFunSuite with SQLHelper {
     "where",
     "with")
 
-  // All the keywords in `docs/sql-keywords.md` are listed below:
-  val allCandidateKeywords = Set(
-    "add",
-    "after",
-    "all",
-    "alter",
-    "analyze",
-    "and",
-    "anti",
-    "any",
-    "archive",
-    "array",
-    "as",
-    "asc",
-    "at",
-    "authorization",
-    "between",
-    "both",
-    "bucket",
-    "buckets",
-    "by",
-    "cache",
-    "cascade",
-    "case",
-    "cast",
-    "change",
-    "check",
-    "clear",
-    "cluster",
-    "clustered",
-    "codegen",
-    "collate",
-    "collection",
-    "column",
-    "columns",
-    "comment",
-    "commit",
-    "compact",
-    "compactions",
-    "compute",
-    "concatenate",
-    "constraint",
-    "cost",
-    "create",
-    "cross",
-    "cube",
-    "current",
-    "current_date",
-    "current_time",
-    "current_timestamp",
-    "current_user",
-    "data",
-    "database",
-    "databases",
-    "day",
-    "dbproperties",
-    "defined",
-    "delete",
-    "delimited",
-    "desc",
-    "describe",
-    "dfs",
-    "directories",
-    "directory",
-    "distinct",
-    "distribute",
-    "div",
-    "drop",
-    "else",
-    "end",
-    "escape",
-    "escaped",
-    "except",
-    "exchange",
-    "exists",
-    "explain",
-    "export",
-    "extended",
-    "external",
-    "extract",
-    "false",
-    "fetch",
-    "fields",
-    "fileformat",
-    "first",
-    "following",
-    "for",
-    "foreign",
-    "format",
-    "formatted",
-    "from",
-    "full",
-    "function",
-    "functions",
-    "global",
-    "grant",
-    "group",
-    "grouping",
-    "having",
-    "hour",
-    "if",
-    "ignore",
-    "import",
-    "in",
-    "index",
-    "indexes",
-    "inner",
-    "inpath",
-    "inputformat",
-    "insert",
-    "intersect",
-    "interval",
-    "into",
-    "is",
-    "items",
-    "join",
-    "keys",
-    "last",
-    "lateral",
-    "lazy",
-    "leading",
-    "left",
-    "like",
-    "limit",
-    "lines",
-    "list",
-    "load",
-    "local",
-    "location",
-    "lock",
-    "locks",
-    "logical",
-    "macro",
-    "map",
-    "minus",
-    "minute",
-    "month",
-    "msck",
-    "namespaces",
-    "natural",
-    "no",
-    "not",
-    "null",
-    "nulls",
-    "of",
-    "on",
-    "only",
-    "option",
-    "options",
-    "or",
-    "order",
-    "out",
-    "outer",
-    "outputformat",
-    "over",
-    "overlaps",
-    "overlay",
-    "overwrite",
-    "partition",
-    "partitioned",
-    "partitions",
-    "percent",
-    "pivot",
-    "placing",
-    "position",
-    "preceding",
-    "primary",
-    "principals",
-    "purge",
-    "query",
-    "range",
-    "recordreader",
-    "recordwriter",
-    "recover",
-    "reduce",
-    "references",
-    "refresh",
-    "rename",
-    "repair",
-    "replace",
-    "reset",
-    "restrict",
-    "revoke",
-    "right",
-    "rlike",
-    "role",
-    "roles",
-    "rollback",
-    "rollup",
-    "row",
-    "rows",
-    "schema",
-    "second",
-    "select",
-    "semi",
-    "separated",
-    "serde",
-    "serdeproperties",
-    "session_user",
-    "set",
-    "sets",
-    "show",
-    "skewed",
-    "some",
-    "sort",
-    "sorted",
-    "start",
-    "statistics",
-    "stored",
-    "stratify",
-    "struct",
-    "substr",
-    "substring",
-    "table",
-    "tables",
-    "tablesample",
-    "tblproperties",
-    "temporary",
-    "terminated",
-    "then",
-    "to",
-    "touch",
-    "trailing",
-    "transaction",
-    "transactions",
-    "transform",
-    "true",
-    "truncate",
-    "type",
-    "unarchive",
-    "unbounded",
-    "uncache",
-    "union",
-    "unique",
-    "unknown",
-    "unlock",
-    "unset",
-    "use",
-    "user",
-    "using",
-    "values",
-    "view",
-    "views",
-    "when",
-    "where",
-    "window",
-    "with",
-    "year")
+  private val sqlSyntaxDefs = {
+    val sqlBasePath = {
+      val sparkHome = {
+        assert(sys.props.contains("spark.test.home") ||
+          sys.env.contains("SPARK_HOME"), "spark.test.home or SPARK_HOME is not set.")
+        sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME"))
+      }
+      java.nio.file.Paths.get(sparkHome, "sql", "catalyst", "src", "main", "antlr4", "org",
+        "apache", "spark", "sql", "catalyst", "parser", "SqlBase.g4").toFile
+    }
+    fileToString(sqlBasePath).split("\n")
+  }
 
-  val reservedKeywordsInAnsiMode = Set(
-    "all",
-    "and",
-    "anti",
-    "any",
-    "as",
-    "authorization",
-    "both",
-    "case",
-    "cast",
-    "check",
-    "collate",
-    "column",
-    "constraint",
-    "create",
-    "cross",
-    "current_date",
-    "current_time",
-    "current_timestamp",
-    "current_user",
-    "day",
-    "distinct",
-    "else",
-    "end",
-    "escape",
-    "except",
-    "false",
-    "fetch",
-    "for",
-    "foreign",
-    "from",
-    "full",
-    "grant",
-    "group",
-    "having",
-    "hour",
-    "in",
-    "inner",
-    "intersect",
-    "into",
-    "join",
-    "is",
-    "leading",
-    "left",
-    "minute",
-    "month",
-    "natural",
-    "not",
-    "null",
-    "on",
-    "only",
-    "or",
-    "order",
-    "outer",
-    "overlaps",
-    "primary",
-    "references",
-    "right",
-    "select",
-    "semi",
-    "session_user",
-    "minus",
-    "second",
-    "some",
-    "table",
-    "then",
-    "to",
-    "trailing",
-    "union",
-    "unique",
-    "unknown",
-    "user",
-    "using",
-    "when",
-    "where",
-    "with",
-    "year")
+  private def parseAntlrGrammars[T](startTag: String, endTag: String)
+      (f: PartialFunction[String, Seq[T]]): Set[T] = {
+    val keywords = new mutable.ArrayBuffer[T]
+    val default = (_: String) => Nil
+    var startTagFound = false
+    var parseFinished = false
+    val lineIter = sqlSyntaxDefs.toIterator
+    while (!parseFinished && lineIter.hasNext) {
+      val line = lineIter.next()
+      if (line.trim.startsWith(startTag)) {
+        startTagFound = true
+      } else if (line.trim.startsWith(endTag)) {
+        parseFinished = true
+      } else if (startTagFound) {
+        f.applyOrElse(line, default).foreach { symbol =>
+          keywords += symbol
+        }
+      }
+    }
+    assert(keywords.nonEmpty && startTagFound && parseFinished, "cannot extract keywords from " +
+      s"the `SqlBase.g4` file, so please check if the start/end tags (`$startTag` and `$endTag`) " +
+      "are placed correctly in the file.")
+    keywords.toSet
+  }
 
-  val nonReservedKeywordsInAnsiMode = allCandidateKeywords -- reservedKeywordsInAnsiMode
+  // If a symbol does not have the same string with its literal (e.g., `SETMINUS: 'MINUS';`),
+  // we need to map a symbol to actual literal strings.
+  val symbolsToExpandIntoDifferentLiterals = {
+    val kwDef = """([A-Z_]+):(.+);""".r
+    val keywords = parseAntlrGrammars(
+        "//--SPARK-KEYWORD-LIST-START", "//--SPARK-KEYWORD-LIST-END") {
+      case kwDef(symbol, literalDef) =>
+        val splitDefs = literalDef.split("""\|""")
+        val hasMultipleLiterals = splitDefs.length > 1
+        // The case where a symbol has multiple literal definitions,
+        // e.g., `DATABASES: 'DATABASES' | 'SCHEMAS';`.
+        if (hasMultipleLiterals) {
+          val literals = splitDefs.map(_.replaceAll("'", "").trim).toSeq
+          (symbol, literals) :: Nil
+        } else {
+          val literal = literalDef.replaceAll("'", "").trim
+          // The case where a symbol string and its literal string are different,
+          // e.g., `SETMINUS: 'MINUS';`.
+          if (symbol != literal) {
+            (symbol, literal :: Nil) :: Nil
+          } else {
+            Nil
+          }
+        }
+    }
+    keywords.toMap
+  }
+
+  // All the SQL keywords defined in `SqlBase.g4`
+  val allCandidateKeywords = {
+    val kwDef = """([A-Z_]+):.+;""".r
+    val keywords = parseAntlrGrammars(
+        "//--SPARK-KEYWORD-LIST-START", "//--SPARK-KEYWORD-LIST-END") {
+      // Parses a pattern, e.g., `AFTER: 'AFTER';`
+      case kwDef(symbol) =>
+        if (symbolsToExpandIntoDifferentLiterals.contains(symbol)) {
+          symbolsToExpandIntoDifferentLiterals(symbol)
+        } else {
+          symbol :: Nil
+        }
+    }
+    keywords
+  }
+
+  val nonReservedKeywordsInAnsiMode = {
+    val kwDef = """\s*[\|:]\s*([A-Z_]+)\s*""".r
+    parseAntlrGrammars("//--ANSI-NON-RESERVED-START", "//--ANSI-NON-RESERVED-END") {
+      // Parses a pattern, e.g., `    | AFTER`
+      case kwDef(symbol) =>
+        if (symbolsToExpandIntoDifferentLiterals.contains(symbol)) {
+          symbolsToExpandIntoDifferentLiterals(symbol)
+        } else {
+          symbol :: Nil
+        }
+    }
+  }
+
+  val reservedKeywordsInAnsiMode = allCandidateKeywords -- nonReservedKeywordsInAnsiMode
+
+  test("check # of reserved keywords") {
+    val numReservedKeywords = 78
+    assert(reservedKeywordsInAnsiMode.size == numReservedKeywords,
+      s"The expected number of reserved keywords is $numReservedKeywords, but " +
+        s"${reservedKeywordsInAnsiMode.size} found.")
+  }
 
   test("table identifier") {
     // Regular names.

