Posted to commits@spark.apache.org by we...@apache.org on 2017/03/16 04:06:29 UTC

spark git commit: [SPARK-19830][SQL] Add parseTableSchema API to ParserInterface

Repository: spark
Updated Branches:
  refs/heads/master 21f333c63 -> 1472cac4b


[SPARK-19830][SQL] Add parseTableSchema API to ParserInterface

### What changes were proposed in this pull request?

Specifying a table schema in DDL format is needed in several scenarios. For example:
- [specifying the schema of the SQL function `from_json` in DDL format](https://issues.apache.org/jira/browse/SPARK-19637), as suggested by marmbrus;
- [specifying customized JDBC data types](https://github.com/apache/spark/pull/16209).

Both of these PRs currently require users to supply the table schema in JSON format, which is not user friendly.

This PR adds a `parseTableSchema` API to `ParserInterface`, so that callers can parse a DDL-formatted schema string directly.
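
A rough usage sketch (the `from_json` and JDBC integrations are follow-up work tracked in the links above, not part of this commit):

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types.StructType

// Parse a DDL-formatted column list into a Catalyst schema.
val schema: StructType =
  CatalystSqlParser.parseTableSchema("a INT, b STRING, c MAP<STRING, DOUBLE>")
// schema == new StructType()
//   .add("a", "int")
//   .add("b", "string")
//   .add("c", "map<string,double>")
```

With this in place, SPARK-19637 can let `from_json` accept such a DDL string and build the `StructType` through this parser instead of requiring a JSON-serialized schema.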

### How was this patch tested?
Added a new test suite, `TableSchemaParserSuite` (included in the diff below).

Author: Xiao Li <ga...@gmail.com>

Closes #17171 from gatorsmile/parseDDLStmt.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1472cac4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1472cac4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1472cac4

Branch: refs/heads/master
Commit: 1472cac4bb31c1886f82830778d34c4dd9030d7a
Parents: 21f333c
Author: Xiao Li <ga...@gmail.com>
Authored: Thu Mar 16 12:06:20 2017 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Thu Mar 16 12:06:20 2017 +0800

----------------------------------------------------------------------
 .../spark/sql/catalyst/parser/ParseDriver.scala | 10 ++-
 .../sql/catalyst/parser/ParserInterface.scala   |  7 ++
 .../parser/TableSchemaParserSuite.scala         | 88 ++++++++++++++++++++
 3 files changed, 104 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1472cac4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
index d687a85..f704b09 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.trees.Origin
-import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types.{DataType, StructType}
 
 /**
  * Base SQL parsing infrastructure.
@@ -49,6 +49,14 @@ abstract class AbstractSqlParser extends ParserInterface with Logging {
     astBuilder.visitSingleTableIdentifier(parser.singleTableIdentifier())
   }
 
+  /**
+   * Creates a StructType for a given SQL string, which should be a comma-separated list of
+   * field definitions that will preserve the correct Hive metadata.
+   */
+  override def parseTableSchema(sqlText: String): StructType = parse(sqlText) { parser =>
+    StructType(astBuilder.visitColTypeList(parser.colTypeList()))
+  }
+
   /** Creates LogicalPlan for a given SQL string. */
   override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) { parser =>
     astBuilder.visitSingleStatement(parser.singleStatement()) match {

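A couple of concrete examples of what this method accepts, adapted from the TableSchemaParserSuite added below:

    import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
    import org.apache.spark.sql.types._

    // A column COMMENT is carried into the resulting StructField.
    assert(CatalystSqlParser.parseTableSchema("a int comment 'test'")
      == new StructType().add("a", "int", nullable = true, "test"))

    // Nested types work, and type keywords are case-insensitive.
    assert(CatalystSqlParser.parseTableSchema("a STRUCT<intType: int, ts: timestamp>")
      == StructType(StructField("a", StructType(
           StructField("intType", IntegerType) ::
           StructField("ts", TimestampType) :: Nil)) :: Nil))

As the "complex hive type" test below shows, char/varchar columns are mapped to StringType while the original Hive type string is retained in the field metadata under the HIVE_TYPE_STRING key.
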
http://git-wip-us.apache.org/repos/asf/spark/blob/1472cac4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala
index 7f35d65..6edbe25 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserInterface.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.parser
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.types.StructType
 
 /**
  * Interface for a parser.
@@ -33,4 +34,10 @@ trait ParserInterface {
 
   /** Creates TableIdentifier for a given SQL string. */
   def parseTableIdentifier(sqlText: String): TableIdentifier
+
+  /**
+   * Creates a StructType for a given SQL string, which should be a comma-separated list of
+   * field definitions that will preserve the correct Hive metadata.
+   */
+  def parseTableSchema(sqlText: String): StructType
 }

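Because parseTableSchema is a new abstract method on the trait, any external ParserInterface implementation has to be extended to compile against this change. A hypothetical sketch (the class name MyCustomParser is illustrative, not from the commit) that simply delegates to the built-in CatalystSqlParser:

    import org.apache.spark.sql.catalyst.TableIdentifier
    import org.apache.spark.sql.catalyst.expressions.Expression
    import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface}
    import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
    import org.apache.spark.sql.types.StructType

    class MyCustomParser extends ParserInterface {
      override def parsePlan(sqlText: String): LogicalPlan =
        CatalystSqlParser.parsePlan(sqlText)

      override def parseExpression(sqlText: String): Expression =
        CatalystSqlParser.parseExpression(sqlText)

      override def parseTableIdentifier(sqlText: String): TableIdentifier =
        CatalystSqlParser.parseTableIdentifier(sqlText)

      // The method added by this commit.
      override def parseTableSchema(sqlText: String): StructType =
        CatalystSqlParser.parseTableSchema(sqlText)
    }
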
http://git-wip-us.apache.org/repos/asf/spark/blob/1472cac4/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala
new file mode 100644
index 0000000..da1041d
--- /dev/null
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableSchemaParserSuite.scala
@@ -0,0 +1,88 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql.catalyst.parser
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
+
+class TableSchemaParserSuite extends SparkFunSuite {
+
+  def parse(sql: String): StructType = CatalystSqlParser.parseTableSchema(sql)
+
+  def checkTableSchema(tableSchemaString: String, expectedDataType: DataType): Unit = {
+    test(s"parse $tableSchemaString") {
+      assert(parse(tableSchemaString) === expectedDataType)
+    }
+  }
+
+  def assertError(sql: String): Unit =
+    intercept[ParseException](CatalystSqlParser.parseTableSchema(sql))
+
+  checkTableSchema("a int", new StructType().add("a", "int"))
+  checkTableSchema("A int", new StructType().add("A", "int"))
+  checkTableSchema("a INT", new StructType().add("a", "int"))
+  checkTableSchema("`!@#$%.^&*()` string", new StructType().add("!@#$%.^&*()", "string"))
+  checkTableSchema("a int, b long", new StructType().add("a", "int").add("b", "long"))
+  checkTableSchema("a STRUCT<intType: int, ts:timestamp>",
+    StructType(
+      StructField("a", StructType(
+        StructField("intType", IntegerType) ::
+        StructField("ts", TimestampType) :: Nil)) :: Nil))
+  checkTableSchema(
+    "a int comment 'test'",
+    new StructType().add("a", "int", nullable = true, "test"))
+
+  test("complex hive type") {
+    val tableSchemaString =
+      """
+        |complexStructCol struct<
+        |struct:struct<deciMal:DECimal, anotherDecimal:decimAL(5,2)>,
+        |MAP:Map<timestamp, varchar(10)>,
+        |arrAy:Array<double>,
+        |anotherArray:Array<char(9)>>
+      """.stripMargin.replace("\n", "")
+
+    val builder = new MetadataBuilder
+    builder.putString(HIVE_TYPE_STRING,
+      "struct<struct:struct<deciMal:decimal(10,0),anotherDecimal:decimal(5,2)>," +
+        "MAP:map<timestamp,varchar(10)>,arrAy:array<double>,anotherArray:array<char(9)>>")
+
+    val expectedDataType =
+      StructType(
+        StructField("complexStructCol", StructType(
+          StructField("struct",
+            StructType(
+              StructField("deciMal", DecimalType.USER_DEFAULT) ::
+                StructField("anotherDecimal", DecimalType(5, 2)) :: Nil)) ::
+            StructField("MAP", MapType(TimestampType, StringType)) ::
+            StructField("arrAy", ArrayType(DoubleType)) ::
+            StructField("anotherArray", ArrayType(StringType)) :: Nil),
+          nullable = true,
+          builder.build()) :: Nil)
+
+    assert(parse(tableSchemaString) === expectedDataType)
+  }
+
+  // Negative cases
+  assertError("")
+  assertError("a")
+  assertError("a INT b long")
+  assertError("a INT,, b long")
+  assertError("a INT, b long,,")
+  assertError("a INT, b long, c int,")
+}

