You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2016/10/25 02:47:18 UTC
spark git commit: [SPARK-17409][SQL][FOLLOW-UP] Do Not Optimize Query
in CTAS More Than Once
Repository: spark
Updated Branches:
refs/heads/master 84a339990 -> d479c5262
[SPARK-17409][SQL][FOLLOW-UP] Do Not Optimize Query in CTAS More Than Once
### What changes were proposed in this pull request?
This follow-up PR is for addressing the [comment](https://github.com/apache/spark/pull/15048).
We added two test cases based on the suggestion from yhuai. One is a new test case using the `saveAsTable` API to create a data source table. The other is for CTAS on a Hive serde table.
Note: There is no need to backport this PR to 2.0. A new PR will be submitted to backport the whole fix, together with the new test cases, to Spark 2.0.
### How was this patch tested?
N/A
Author: gatorsmile <ga...@gmail.com>
Closes #15459 from gatorsmile/ctasOptimizedTestCases.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d479c526
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d479c526
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d479c526
Branch: refs/heads/master
Commit: d479c5262276b47302659bd877a9e3467400bdb6
Parents: 84a3399
Author: gatorsmile <ga...@gmail.com>
Authored: Tue Oct 25 10:47:11 2016 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Tue Oct 25 10:47:11 2016 +0800
----------------------------------------------------------------------
.../org/apache/spark/sql/DataFrameSuite.scala | 18 ++++++++++++++++++
.../sql/sources/CreateTableAsSelectSuite.scala | 2 +-
.../spark/sql/hive/MetastoreRelationSuite.scala | 20 ++++++++++++++++++--
3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/d479c526/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index e87baa4..3fb7eee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1599,6 +1599,24 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
assert(df.persist.take(1).apply(0).toSeq(100).asInstanceOf[Long] == 100)
}
+ test("SPARK-17409: Do Not Optimize Query in CTAS (Data source tables) More Than Once") {
+ withTable("bar") {
+ withTempView("foo") {
+ withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "json") {
+ sql("select 0 as id").createOrReplaceTempView("foo")
+ val df = sql("select * from foo group by id")
+ // If we optimize the query in CTAS more than once, the following saveAsTable will fail
+ // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
+ df.write.mode("overwrite").saveAsTable("bar")
+ checkAnswer(spark.table("bar"), Row(0) :: Nil)
+ val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
+ assert(tableMetadata.provider == Some("json"),
+ "the expected table is a data source table using json")
+ }
+ }
+ }
+ }
+
test("copy results for sampling with replacement") {
val df = Seq((1, 0), (2, 0), (3, 0)).toDF("a", "b")
val sampleDf = df.sample(true, 2.00)
http://git-wip-us.apache.org/repos/asf/spark/blob/d479c526/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
index c39005f..5cc9467 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
@@ -238,7 +238,7 @@ class CreateTableAsSelectSuite
}
}
- test("CTAS of decimal calculation") {
+ test("SPARK-17409: CTAS of decimal calculation") {
withTable("tab2") {
withTempView("tab1") {
spark.range(99, 101).createOrReplaceTempView("tab1")
http://git-wip-us.apache.org/repos/asf/spark/blob/d479c526/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
index c28e41a..91ff711 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreRelationSuite.scala
@@ -17,12 +17,14 @@
package org.apache.spark.sql.hive
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
-class MetastoreRelationSuite extends SparkFunSuite {
+class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
test("makeCopy and toJSON should work") {
val table = CatalogTable(
identifier = TableIdentifier("test", Some("db")),
@@ -36,4 +38,18 @@ class MetastoreRelationSuite extends SparkFunSuite {
// No exception should be thrown
relation.toJSON
}
+
+ test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
+ withTable("bar") {
+ withTempView("foo") {
+ sql("select 0 as id").createOrReplaceTempView("foo")
+ // If we optimize the query in CTAS more than once, the following CTAS will fail
+ // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
+ sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
+ checkAnswer(spark.table("bar"), Row(0) :: Nil)
+ val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
+ assert(tableMetadata.provider == Some("hive"), "the expected table is a Hive serde table")
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org