You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2016/10/25 02:47:18 UTC
spark git commit: [SPARK-17409][SQL][FOLLOW-UP] Do Not Optimize Query in CTAS More Than Once

Repository: spark
Updated Branches:
  refs/heads/master 84a339990 -> d479c5262


[SPARK-17409][SQL][FOLLOW-UP] Do Not Optimize Query in CTAS More Than Once

### What changes were proposed in this pull request?
This follow-up PR is for addressing the [comment](https://github.com/apache/spark/pull/15048).

We added two test cases based on the suggestion from yhuai. One is a new test case using the `saveAsTable` API to create a data source table. The other is for CTAS on a Hive serde table.

Note: There is no need to backport this PR to 2.0. A new PR will be submitted to backport the whole fix, together with the new test cases, to Spark 2.0.

### How was this patch tested?
N/A

Author: gatorsmile <ga...@gmail.com>

Closes #15459 from gatorsmile/ctasOptimizedTestCases.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d479c526
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d479c526
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d479c526

Branch: refs/heads/master
Commit: d479c5262276b47302659bd877a9e3467400bdb6
Parents: 84a3399
Author: gatorsmile <ga...@gmail.com>
Authored: Tue Oct 25 10:47:11 2016 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Tue Oct 25 10:47:11 2016 +0800

----------------------------------------------------------------------
 .../org/apache/spark/sql/DataFrameSuite.scala   | 18 ++++++++++++++++++
 .../sql/sources/CreateTableAsSelectSuite.scala  |  2 +-
 .../spark/sql/hive/MetastoreRelationSuite.scala | 20 ++++++++++++++++++--
 3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d479c526/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index e87baa4..3fb7eee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1599,6 +1599,24 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
     assert(df.persist.take(1).apply(0).toSeq(100).asInstanceOf[Long] == 100)
   }
 
+  test("SPARK-17409: Do Not Optimize Query in CTAS (Data source tables) More Than Once") {
+    withTable("bar") {
+      withTempView("foo") {
+        withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "json") {
+          sql("select 0 as id").createOrReplaceTempView("foo")
+          val df = sql("select * from foo group by id")
+          // If we optimize the query in CTAS more than once, the following saveAsTable will fail
+          // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
+          df.write.mode("overwrite").saveAsTable("bar")
+          checkAnswer(spark.table("bar"), Row(0) :: Nil)
+          val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
+          assert(tableMetadata.provider == Some("json"),
+            "the expected table is a data source table using json")
+        }
+      }
+    }
+  }
+
   test("copy results for sampling with replacement") {
     val df = Seq((1, 0), (2, 0), (3, 0)).toDF("a", "b")
     val sampleDf = df.sample(true, 2.00)

http://git-wip-us.apache.org/repos/asf/spark/blob/d479c526/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
index c39005f..5cc9467 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
@@ -238,7 +238,7 @@ class CreateTableAsSelectSuite
     }
   }
 
-  test("CTAS of decimal calculation") {
+  test("SPARK-17409: CTAS of decimal calculation") {
     withTable("tab2") {
       withTempView("tab1") {
         spark.range(99, 101).createOrReplaceTempView("tab1")

http://git-wip-us.apache.org/repos/asf/spark/blob/d479c526/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
index c28e41a..91ff711 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
@@ -17,12 +17,14 @@
 
 package org.apache.spark.sql.hive
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.{QueryTest, Row}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
 
-class MetastoreRelationSuite extends SparkFunSuite {
+class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
   test("makeCopy and toJSON should work") {
     val table = CatalogTable(
       identifier = TableIdentifier("test", Some("db")),
@@ -36,4 +38,18 @@ class MetastoreRelationSuite extends SparkFunSuite {
     // No exception should be thrown
     relation.toJSON
   }
+
+  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
+    withTable("bar") {
+      withTempView("foo") {
+        sql("select 0 as id").createOrReplaceTempView("foo")
+        // If we optimize the query in CTAS more than once, the following CTAS statement will fail
+        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
+        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
+        checkAnswer(spark.table("bar"), Row(0) :: Nil)
+        val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
+        assert(tableMetadata.provider == Some("hive"), "the expected table is a Hive serde table")
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org