You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ya...@apache.org on 2020/06/15 15:47:44 UTC
[spark] 01/02: [SPARK-31950][SQL][TESTS] Extract SQL keywords from
the SqlBase.g4 file
This is an automated email from the ASF dual-hosted git repository.
yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
commit b70c68ae458d929cbf28a084cecf8252b4a3849f
Author: Takeshi Yamamuro <ya...@apache.org>
AuthorDate: Sat Jun 13 07:12:27 2020 +0900
[SPARK-31950][SQL][TESTS] Extract SQL keywords from the SqlBase.g4 file
### What changes were proposed in this pull request?
This PR intends to extract SQL reserved/non-reserved keywords from the ANTLR grammar file (`SqlBase.g4`) directly.
This approach is based on cloud-fan's suggestion: https://github.com/apache/spark/pull/28779#issuecomment-642033217
### Why are the changes needed?
It is hard to maintain a full set of the keywords in `TableIdentifierParserSuite`, so it would be nice if we could extract them from the `SqlBase.g4` file directly.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #28802 from maropu/SPARK-31950-2.
Authored-by: Takeshi Yamamuro <ya...@apache.org>
Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
.../apache/spark/sql/catalyst/parser/SqlBase.g4 | 4 +
.../parser/TableIdentifierParserSuite.scala | 432 +++++----------------
2 files changed, 110 insertions(+), 326 deletions(-)
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index 208a503..14a6687 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -989,6 +989,7 @@ alterColumnAction
// You can find the full keywords list by searching "Start of the keywords list" in this file.
// The non-reserved keywords are listed below. Keywords not in this list are reserved keywords.
ansiNonReserved
+//--ANSI-NON-RESERVED-START
: ADD
| AFTER
| ALTER
@@ -1165,6 +1166,7 @@ ansiNonReserved
| VIEW
| VIEWS
| WINDOW
+//--ANSI-NON-RESERVED-END
;
// When `SQL_standard_keyword_behavior=false`, there are 2 kinds of keywords in Spark SQL.
@@ -1442,6 +1444,7 @@ nonReserved
//============================
// Start of the keywords list
//============================
+//--SPARK-KEYWORD-LIST-START
ADD: 'ADD';
AFTER: 'AFTER';
ALL: 'ALL';
@@ -1694,6 +1697,7 @@ WHERE: 'WHERE';
WINDOW: 'WINDOW';
WITH: 'WITH';
YEAR: 'YEAR';
+//--SPARK-KEYWORD-LIST-END
//============================
// End of the keywords list
//============================
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
index bd617bf..04969e3 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
@@ -16,9 +16,14 @@
*/
package org.apache.spark.sql.catalyst.parser
+import java.util.Locale
+
+import scala.collection.mutable
+
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.catalyst.util.fileToString
import org.apache.spark.sql.internal.SQLConf
class TableIdentifierParserSuite extends SparkFunSuite with SQLHelper {
@@ -285,334 +290,109 @@ class TableIdentifierParserSuite extends SparkFunSuite with SQLHelper {
"where",
"with")
- // All the keywords in `docs/sql-keywords.md` are listed below:
- val allCandidateKeywords = Set(
- "add",
- "after",
- "all",
- "alter",
- "analyze",
- "and",
- "anti",
- "any",
- "archive",
- "array",
- "as",
- "asc",
- "at",
- "authorization",
- "between",
- "both",
- "bucket",
- "buckets",
- "by",
- "cache",
- "cascade",
- "case",
- "cast",
- "change",
- "check",
- "clear",
- "cluster",
- "clustered",
- "codegen",
- "collate",
- "collection",
- "column",
- "columns",
- "comment",
- "commit",
- "compact",
- "compactions",
- "compute",
- "concatenate",
- "constraint",
- "cost",
- "create",
- "cross",
- "cube",
- "current",
- "current_date",
- "current_time",
- "current_timestamp",
- "current_user",
- "data",
- "database",
- "databases",
- "day",
- "dbproperties",
- "defined",
- "delete",
- "delimited",
- "desc",
- "describe",
- "dfs",
- "directories",
- "directory",
- "distinct",
- "distribute",
- "div",
- "drop",
- "else",
- "end",
- "escape",
- "escaped",
- "except",
- "exchange",
- "exists",
- "explain",
- "export",
- "extended",
- "external",
- "extract",
- "false",
- "fetch",
- "fields",
- "fileformat",
- "first",
- "following",
- "for",
- "foreign",
- "format",
- "formatted",
- "from",
- "full",
- "function",
- "functions",
- "global",
- "grant",
- "group",
- "grouping",
- "having",
- "hour",
- "if",
- "ignore",
- "import",
- "in",
- "index",
- "indexes",
- "inner",
- "inpath",
- "inputformat",
- "insert",
- "intersect",
- "interval",
- "into",
- "is",
- "items",
- "join",
- "keys",
- "last",
- "lateral",
- "lazy",
- "leading",
- "left",
- "like",
- "limit",
- "lines",
- "list",
- "load",
- "local",
- "location",
- "lock",
- "locks",
- "logical",
- "macro",
- "map",
- "minus",
- "minute",
- "month",
- "msck",
- "namespaces",
- "natural",
- "no",
- "not",
- "null",
- "nulls",
- "of",
- "on",
- "only",
- "option",
- "options",
- "or",
- "order",
- "out",
- "outer",
- "outputformat",
- "over",
- "overlaps",
- "overlay",
- "overwrite",
- "partition",
- "partitioned",
- "partitions",
- "percent",
- "pivot",
- "placing",
- "position",
- "preceding",
- "primary",
- "principals",
- "purge",
- "query",
- "range",
- "recordreader",
- "recordwriter",
- "recover",
- "reduce",
- "references",
- "refresh",
- "rename",
- "repair",
- "replace",
- "reset",
- "restrict",
- "revoke",
- "right",
- "rlike",
- "role",
- "roles",
- "rollback",
- "rollup",
- "row",
- "rows",
- "schema",
- "second",
- "select",
- "semi",
- "separated",
- "serde",
- "serdeproperties",
- "session_user",
- "set",
- "sets",
- "show",
- "skewed",
- "some",
- "sort",
- "sorted",
- "start",
- "statistics",
- "stored",
- "stratify",
- "struct",
- "substr",
- "substring",
- "table",
- "tables",
- "tablesample",
- "tblproperties",
- "temporary",
- "terminated",
- "then",
- "to",
- "touch",
- "trailing",
- "transaction",
- "transactions",
- "transform",
- "true",
- "truncate",
- "type",
- "unarchive",
- "unbounded",
- "uncache",
- "union",
- "unique",
- "unknown",
- "unlock",
- "unset",
- "use",
- "user",
- "using",
- "values",
- "view",
- "views",
- "when",
- "where",
- "window",
- "with",
- "year")
+ private val sqlSyntaxDefs = {
+ val sqlBasePath = {
+ val sparkHome = {
+ assert(sys.props.contains("spark.test.home") ||
+ sys.env.contains("SPARK_HOME"), "spark.test.home or SPARK_HOME is not set.")
+ sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME"))
+ }
+ java.nio.file.Paths.get(sparkHome, "sql", "catalyst", "src", "main", "antlr4", "org",
+ "apache", "spark", "sql", "catalyst", "parser", "SqlBase.g4").toFile
+ }
+ fileToString(sqlBasePath).split("\n")
+ }
- val reservedKeywordsInAnsiMode = Set(
- "all",
- "and",
- "anti",
- "any",
- "as",
- "authorization",
- "both",
- "case",
- "cast",
- "check",
- "collate",
- "column",
- "constraint",
- "create",
- "cross",
- "current_date",
- "current_time",
- "current_timestamp",
- "current_user",
- "day",
- "distinct",
- "else",
- "end",
- "escape",
- "except",
- "false",
- "fetch",
- "for",
- "foreign",
- "from",
- "full",
- "grant",
- "group",
- "having",
- "hour",
- "in",
- "inner",
- "intersect",
- "into",
- "join",
- "is",
- "leading",
- "left",
- "minute",
- "month",
- "natural",
- "not",
- "null",
- "on",
- "only",
- "or",
- "order",
- "outer",
- "overlaps",
- "primary",
- "references",
- "right",
- "select",
- "semi",
- "session_user",
- "minus",
- "second",
- "some",
- "table",
- "then",
- "to",
- "trailing",
- "union",
- "unique",
- "unknown",
- "user",
- "using",
- "when",
- "where",
- "with",
- "year")
+ private def parseAntlrGrammars[T](startTag: String, endTag: String)
+ (f: PartialFunction[String, Seq[T]]): Set[T] = {
+ val keywords = new mutable.ArrayBuffer[T]
+ val default = (_: String) => Nil
+ var startTagFound = false
+ var parseFinished = false
+ val lineIter = sqlSyntaxDefs.toIterator
+ while (!parseFinished && lineIter.hasNext) {
+ val line = lineIter.next()
+ if (line.trim.startsWith(startTag)) {
+ startTagFound = true
+ } else if (line.trim.startsWith(endTag)) {
+ parseFinished = true
+ } else if (startTagFound) {
+ f.applyOrElse(line, default).foreach { symbol =>
+ keywords += symbol
+ }
+ }
+ }
+ assert(keywords.nonEmpty && startTagFound && parseFinished, "cannot extract keywords from " +
+ s"the `SqlBase.g4` file, so please check if the start/end tags (`$startTag` and `$endTag`) " +
+ "are placed correctly in the file.")
+ keywords.toSet
+ }
- val nonReservedKeywordsInAnsiMode = allCandidateKeywords -- reservedKeywordsInAnsiMode
+ // If a symbol does not have the same string with its literal (e.g., `SETMINUS: 'MINUS';`),
+ // we need to map a symbol to actual literal strings.
+ val symbolsToExpandIntoDifferentLiterals = {
+ val kwDef = """([A-Z_]+):(.+);""".r
+ val keywords = parseAntlrGrammars(
+ "//--SPARK-KEYWORD-LIST-START", "//--SPARK-KEYWORD-LIST-END") {
+ case kwDef(symbol, literalDef) =>
+ val splitDefs = literalDef.split("""\|""")
+ val hasMultipleLiterals = splitDefs.length > 1
+ // The case where a symbol has multiple literal definitions,
+ // e.g., `DATABASES: 'DATABASES' | 'SCHEMAS';`.
+ if (hasMultipleLiterals) {
+ val literals = splitDefs.map(_.replaceAll("'", "").trim).toSeq
+ (symbol, literals) :: Nil
+ } else {
+ val literal = literalDef.replaceAll("'", "").trim
+ // The case where a symbol string and its literal string are different,
+ // e.g., `SETMINUS: 'MINUS';`.
+ if (symbol != literal) {
+ (symbol, literal :: Nil) :: Nil
+ } else {
+ Nil
+ }
+ }
+ }
+ keywords.toMap
+ }
+
+ // All the SQL keywords defined in `SqlBase.g4`
+ val allCandidateKeywords = {
+ val kwDef = """([A-Z_]+):.+;""".r
+ val keywords = parseAntlrGrammars(
+ "//--SPARK-KEYWORD-LIST-START", "//--SPARK-KEYWORD-LIST-END") {
+ // Parses a pattern, e.g., `AFTER: 'AFTER';`
+ case kwDef(symbol) =>
+ if (symbolsToExpandIntoDifferentLiterals.contains(symbol)) {
+ symbolsToExpandIntoDifferentLiterals(symbol)
+ } else {
+ symbol :: Nil
+ }
+ }
+ keywords
+ }
+
+ val nonReservedKeywordsInAnsiMode = {
+ val kwDef = """\s*[\|:]\s*([A-Z_]+)\s*""".r
+ parseAntlrGrammars("//--ANSI-NON-RESERVED-START", "//--ANSI-NON-RESERVED-END") {
+ // Parses a pattern, e.g., ` | AFTER`
+ case kwDef(symbol) =>
+ if (symbolsToExpandIntoDifferentLiterals.contains(symbol)) {
+ symbolsToExpandIntoDifferentLiterals(symbol)
+ } else {
+ symbol :: Nil
+ }
+ }
+ }
+
+ val reservedKeywordsInAnsiMode = allCandidateKeywords -- nonReservedKeywordsInAnsiMode
+
+ test("check # of reserved keywords") {
+ val numReservedKeywords = 78
+ assert(reservedKeywordsInAnsiMode.size == numReservedKeywords,
+ s"The expected number of reserved keywords is $numReservedKeywords, but " +
+ s"${reservedKeywordsInAnsiMode.size} found.")
+ }
test("table identifier") {
// Regular names.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org