You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hv...@apache.org on 2016/09/15 18:53:55 UTC
spark git commit: [SPARK-17364][SQL] Antlr lexer wrongly treats full
qualified identifier as a decimal number token when parsing SQL string
Repository: spark
Updated Branches:
refs/heads/master fe767395f -> a6b818200
[SPARK-17364][SQL] Antlr lexer wrongly treats full qualified identifier as a decimal number token when parsing SQL string
## What changes were proposed in this pull request?
The ANTLR lexer we use to tokenize a SQL string may wrongly tokenize part of a fully qualified identifier as a decimal number token. For example, the table identifier `default.123_table` is wrongly tokenized as
```
default // Matches lexer rule IDENTIFIER
.123 // Matches lexer rule DECIMAL_VALUE
_TABLE // Matches lexer rule IDENTIFIER
```
The correct tokenization for `default.123_table` should be:
```
default // Matches lexer rule IDENTIFIER
. // Matches a single dot
123_TABLE // Matches lexer rule IDENTIFIER
```
This PR fixes the ANTLR grammar so that it tokenizes fully qualified identifiers correctly:
1. Fully qualified table name can be parsed correctly. For example, `select * from database.123_suffix`.
2. Fully qualified column name can be parsed correctly, for example `select a.123_suffix from a`.
### Before change
#### Case 1: Failed to parse fully qualified column name
```
scala> spark.sql("select a.123_column from a").show
org.apache.spark.sql.catalyst.parser.ParseException:
extraneous input '.123' expecting {<EOF>,
...
, IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 8)
== SQL ==
select a.123_column from a
--------^^^
```
#### Case 2: Failed to parse fully qualified table name
```
scala> spark.sql("select * from default.123_table")
org.apache.spark.sql.catalyst.parser.ParseException:
extraneous input '.123' expecting {<EOF>,
...
IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 21)
== SQL ==
select * from default.123_table
---------------------^^^
```
### After Change
#### Case 1: fully qualified column name, no ParseException thrown
```
scala> spark.sql("select a.123_column from a").show
```
#### Case 2: fully qualified table name, no ParseException thrown
```
scala> spark.sql("select * from default.123_table")
```
## How was this patch tested?
Unit test.
Author: Sean Zhong <se...@databricks.com>
Closes #15006 from clockfly/SPARK-17364.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6b81820
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6b81820
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6b81820
Branch: refs/heads/master
Commit: a6b8182006d0c3dda67c06861067ca78383ecf1b
Parents: fe76739
Author: Sean Zhong <se...@databricks.com>
Authored: Thu Sep 15 20:53:48 2016 +0200
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Thu Sep 15 20:53:48 2016 +0200
----------------------------------------------------------------------
.../apache/spark/sql/catalyst/parser/SqlBase.g4 | 44 ++++++++++++++++----
.../catalyst/parser/ExpressionParserSuite.scala | 15 ++++++-
.../parser/TableIdentifierParserSuite.scala | 13 ++++++
3 files changed, 63 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/a6b81820/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index b475abd..7023c0c 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -16,6 +16,30 @@
grammar SqlBase;
+@members {
+ /**
+ * Verify whether the current token is a valid decimal token (one which contains a dot).
+ * Returns true if the next character, LA(1), is not a digit, a letter 'A'-'Z' or underscore.
+ * NOTE(review): 'a'-'z' is not tested; presumably the input is upper-cased first - confirm.
+ * For example:
+ * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
+ * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
+ * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
+ * For char stream "12.0D 34.E2+0.12 ", 12.0D is a valid decimal token because it is followed
+ * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
+ * which is not a digit or letter or underscore.
+ */
+ public boolean isValidDecimal() {
+ int nextChar = _input.LA(1);
+ if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' ||
+ nextChar == '_') {
+ return false;
+ } else {
+ return true;
+ }
+ }
+}
+
tokens {
DELIMITER
}
@@ -920,23 +944,22 @@ INTEGER_VALUE
;
DECIMAL_VALUE
- : DIGIT+ '.' DIGIT*
- | '.' DIGIT+
+ : DECIMAL_DIGITS {isValidDecimal()}?
;
SCIENTIFIC_DECIMAL_VALUE
- : DIGIT+ ('.' DIGIT*)? EXPONENT
- | '.' DIGIT+ EXPONENT
+ : DIGIT+ EXPONENT
+ | DECIMAL_DIGITS EXPONENT {isValidDecimal()}?
;
DOUBLE_LITERAL
- :
- (INTEGER_VALUE | DECIMAL_VALUE | SCIENTIFIC_DECIMAL_VALUE) 'D'
+ : DIGIT+ EXPONENT? 'D'
+ | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
;
BIGDECIMAL_LITERAL
- :
- (INTEGER_VALUE | DECIMAL_VALUE | SCIENTIFIC_DECIMAL_VALUE) 'BD'
+ : DIGIT+ EXPONENT? 'BD'
+ | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
;
IDENTIFIER
@@ -947,6 +970,11 @@ BACKQUOTED_IDENTIFIER
: '`' ( ~'`' | '``' )* '`'
;
+fragment DECIMAL_DIGITS
+ : DIGIT+ '.' DIGIT*
+ | '.' DIGIT+
+ ;
+
fragment EXPONENT
: 'E' [+-]? DIGIT+
;
http://git-wip-us.apache.org/repos/asf/spark/blob/a6b81820/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
index 4e399ee..f319215 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.parser
import java.sql.{Date, Timestamp}
-import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, _}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
@@ -518,4 +518,17 @@ class ExpressionParserSuite extends PlanTest {
assertEqual("current_date", CurrentDate())
assertEqual("current_timestamp", CurrentTimestamp())
}
+
+ test("SPARK-17364, fully qualified column name which starts with number") {
+ assertEqual("123_", UnresolvedAttribute("123_"))
+ assertEqual("1a.123_", UnresolvedAttribute("1a.123_"))
+ // ".123" should not be treated as token of type DECIMAL_VALUE
+ assertEqual("a.123A", UnresolvedAttribute("a.123A"))
+ // ".123E3" should not be treated as token of type SCIENTIFIC_DECIMAL_VALUE
+ assertEqual("a.123E3_column", UnresolvedAttribute("a.123E3_column"))
+ // ".123D" should not be treated as token of type DOUBLE_LITERAL
+ assertEqual("a.123D_column", UnresolvedAttribute("a.123D_column"))
+ // ".123BD" should not be treated as token of type BIGDECIMAL_LITERAL
+ assertEqual("a.123BD_column", UnresolvedAttribute("a.123BD_column"))
+ }
}
http://git-wip-us.apache.org/repos/asf/spark/blob/a6b81820/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
index dadb8a8..793be89 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala
@@ -91,4 +91,17 @@ class TableIdentifierParserSuite extends SparkFunSuite {
assert(TableIdentifier(nonReserved) === parseTableIdentifier(nonReserved))
}
}
+
+ test("SPARK-17364 table identifier - contains number") {
+ assert(parseTableIdentifier("123_") == TableIdentifier("123_"))
+ assert(parseTableIdentifier("1a.123_") == TableIdentifier("123_", Some("1a")))
+ // ".123" should not be treated as token of type DECIMAL_VALUE
+ assert(parseTableIdentifier("a.123A") == TableIdentifier("123A", Some("a")))
+ // ".123E3" should not be treated as token of type SCIENTIFIC_DECIMAL_VALUE
+ assert(parseTableIdentifier("a.123E3_LIST") == TableIdentifier("123E3_LIST", Some("a")))
+ // ".123D" should not be treated as token of type DOUBLE_LITERAL
+ assert(parseTableIdentifier("a.123D_LIST") == TableIdentifier("123D_LIST", Some("a")))
+ // ".123BD" should not be treated as token of type BIGDECIMAL_LITERAL
+ assert(parseTableIdentifier("a.123BD_LIST") == TableIdentifier("123BD_LIST", Some("a")))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org