You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2018/02/15 07:53:03 UTC
spark git commit: [SPARK-23421][SPARK-22356][SQL] Document the behavior change in

Repository: spark
Updated Branches:
  refs/heads/master 658d9d9d7 -> a77ebb092


[SPARK-23421][SPARK-22356][SQL] Document the behavior change in

## What changes were proposed in this pull request?
https://github.com/apache/spark/pull/19579 introduces a behavior change. We need to document it in the migration guide.

## How was this patch tested?
Also update the HiveExternalCatalogVersionsSuite to verify it.

Author: gatorsmile <ga...@gmail.com>

Closes #20606 from gatorsmile/addMigrationGuide.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a77ebb09
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a77ebb09
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a77ebb09

Branch: refs/heads/master
Commit: a77ebb0921e390cf4fc6279a8c0a92868ad7e69b
Parents: 658d9d9
Author: gatorsmile <ga...@gmail.com>
Authored: Wed Feb 14 23:52:59 2018 -0800
Committer: gatorsmile <ga...@gmail.com>
Committed: Wed Feb 14 23:52:59 2018 -0800

----------------------------------------------------------------------
 docs/sql-programming-guide.md                                    | 2 ++
 .../apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a77ebb09/docs/sql-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 0f9f01e..cf9529a 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1963,6 +1963,8 @@ working with timestamps in `pandas_udf`s to get the best performance, see
 ## Upgrading From Spark SQL 2.1 to 2.2
 
   - Spark 2.1.1 introduced a new configuration key: `spark.sql.hive.caseSensitiveInferenceMode`. It had a default setting of `NEVER_INFER`, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting's default value to `INFER_AND_SAVE` to restore compatibility with reading Hive metastore tables whose underlying file schema have mixed-case column names. With the `INFER_AND_SAVE` configuration value, on first access Spark will perform schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set `spark.sql.hive.caseSensitiveInferenceMode` to `NEVER_INFER` to avoid the initial overhead of schema inference. Note that with the new default `INFER_AND_SAVE` setting, the results of the schema inference are saved as a metastore key for future use
 . Therefore, the initial schema inference occurs only at a table's first access.
+  
+  - Since Spark 2.2.1 and 2.3.0, the schema is always inferred at runtime when the data source tables have the columns that exist in both partition schema and data schema. The inferred schema does not have the partitioned columns. When reading the table, Spark respects the partition values of these overlapping columns instead of the values stored in the data source files. In 2.2.0 and 2.1.x release, the inferred schema is partitioned but the data of the table is invisible to users (i.e., the result set is empty).
 
 ## Upgrading From Spark SQL 2.0 to 2.1
 

http://git-wip-us.apache.org/repos/asf/spark/blob/a77ebb09/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala
index ae4aeb7..c13a750 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala
@@ -195,7 +195,7 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils {
 
 object PROCESS_TABLES extends QueryTest with SQLTestUtils {
   // Tests the latest version of every release line.
-  val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0")
+  val testingVersions = Seq("2.0.2", "2.1.2", "2.2.0", "2.2.1")
 
   protected var spark: SparkSession = _
 
@@ -249,7 +249,7 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils {
 
       // SPARK-22356: overlapped columns between data and partition schema in data source tables
       val tbl_with_col_overlap = s"tbl_with_col_overlap_$index"
-      // For Spark 2.2.0 and 2.1.x, the behavior is different from Spark 2.0.
+      // For Spark 2.2.0 and 2.1.x, the behavior is different from Spark 2.0, 2.2.1, 2.3+
       if (testingVersions(index).startsWith("2.1") || testingVersions(index) == "2.2.0") {
         spark.sql("msck repair table " + tbl_with_col_overlap)
         assert(spark.table(tbl_with_col_overlap).columns === Array("i", "j", "p"))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org