Posted to commits@spark.apache.org by we...@apache.org on 2016/09/07 01:36:58 UTC

spark git commit: [SPARK-17238][SQL] simplify the logic for converting data source table into hive compatible format

Repository: spark
Updated Branches:
  refs/heads/master a40657bfd -> d6eede9a3


[SPARK-17238][SQL] simplify the logic for converting data source table into hive compatible format

## What changes were proposed in this pull request?

Previously we had two conditions to decide whether a data source table is Hive-compatible:

1. the data source is file-based and has a corresponding Hive serde
2. the table has a `path` entry in its data source options/storage properties

However, if condition 1 holds, condition 2 must hold too, because we put the default table path into the data source options/storage properties for managed data source tables.
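
For illustration, a minimal sketch of that invariant (the helper name and signature are hypothetical, not Spark's actual code path):

```scala
// Hypothetical helper (`ensurePath` is not a real Spark function) showing the
// invariant: managed data source tables get the default table path injected
// into their storage properties, so any file-based data source table ends up
// with a `path` entry.
def ensurePath(
    storageProperties: Map[String, String],
    isManaged: Boolean,
    defaultTablePath: String): Map[String, String] = {
  if (isManaged && !storageProperties.contains("path")) {
    storageProperties + ("path" -> defaultTablePath)
  } else {
    // External tables must have been created with an explicit path.
    storageProperties
  }
}
```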

There is also a potential issue: we set the `locationUri` even for managed tables.

This PR removes condition 2 and only sets the `locationUri` for external data source tables.
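
Conceptually, the change reduces to the following (a condensed sketch of the logic in the diff below; `locationUriFor` is an illustrative name, and the real code looks `path` up case-insensitively via `CaseInsensitiveMap`):

```scala
import org.apache.hadoop.fs.Path

// Condensed sketch of the new rule: only external tables carry an explicit
// location; managed tables leave locationUri unset so the default warehouse
// location applies.
def locationUriFor(
    isExternal: Boolean,
    storageProperties: Map[String, String]): Option[String] = {
  if (isExternal) {
    // A Hive-compatible data source table is file-based, so an external one
    // must have a `path` in its storage properties.
    val path = storageProperties.getOrElse("path",
      throw new IllegalArgumentException(
        "External file-based data source table must have a `path` entry."))
    Some(new Path(path).toUri.toString)
  } else {
    None
  }
}
```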

Note: this is also a first step toward unifying the `path` of data source tables and the `locationUri` of Hive serde tables. For Hive serde tables, `locationUri` is only set for external tables; for data source tables, `path` is always set. We can make them consistent after this PR.
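
For context, a minimal usage sketch of the two flavors this logic distinguishes (the table names and `/tmp` path are made up; the API calls are standard Spark 2.x `DataFrameWriter` methods):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
val df = spark.range(10).toDF("id")

// Managed table: no explicit path. After this change its metastore entry has
// no locationUri, so Hive falls back to the default warehouse location.
df.write.format("parquet").saveAsTable("managed_tbl")

// External table: created with an explicit `path` option. The path is
// normalized to a URI and stored as the table's locationUri.
df.write.format("parquet").option("path", "/tmp/external_tbl").saveAsTable("external_tbl")
```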

## How was this patch tested?

existing tests

Author: Wenchen Fan <we...@databricks.com>

Closes #14809 from cloud-fan/minor2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d6eede9a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d6eede9a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d6eede9a

Branch: refs/heads/master
Commit: d6eede9a36766e2d2294951b054d7557008a5662
Parents: a40657b
Author: Wenchen Fan <we...@databricks.com>
Authored: Wed Sep 7 09:36:53 2016 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Wed Sep 7 09:36:53 2016 +0800

----------------------------------------------------------------------
 .../spark/sql/hive/HiveExternalCatalog.scala    | 32 +++++++++++---------
 1 file changed, 18 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d6eede9a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 2e127ef..d35a681 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -249,10 +249,21 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       }
 
       // converts the table metadata to Hive compatible format, i.e. set the serde information.
-      def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = {
+      def newHiveCompatibleMetastoreTable(serde: HiveSerDe): CatalogTable = {
+        val location = if (tableDefinition.tableType == EXTERNAL) {
+          // When we hit this branch, we are saving an external data source table with hive
+          // compatible format, which means the data source is file-based and must have a `path`.
+          val map = new CaseInsensitiveMap(tableDefinition.storage.properties)
+          require(map.contains("path"),
+            "External file-based data source table must have a `path` entry in storage properties.")
+          Some(new Path(map("path")).toUri.toString)
+        } else {
+          None
+        }
+
         tableDefinition.copy(
           storage = tableDefinition.storage.copy(
-            locationUri = Some(new Path(path).toUri.toString),
+            locationUri = location,
             inputFormat = serde.inputFormat,
             outputFormat = serde.outputFormat,
             serde = serde.serde
@@ -262,11 +273,10 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
 
       val qualifiedTableName = tableDefinition.identifier.quotedString
       val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get)
-      val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path")
       val skipHiveMetadata = tableDefinition.storage.properties
         .getOrElse("skipHiveMetadata", "false").toBoolean
 
-      val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match {
+      val (hiveCompatibleTable, logMessage) = maybeSerde match {
         case _ if skipHiveMetadata =>
           val message =
             s"Persisting data source table $qualifiedTableName into Hive metastore in" +
@@ -280,17 +290,11 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
               "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. "
           (None, message)
 
-        case (Some(serde), Some(path)) =>
+        case Some(serde) =>
           val message =
-            s"Persisting file based data source table $qualifiedTableName with an input path " +
-              s"into Hive metastore in Hive compatible format."
-          (Some(newHiveCompatibleMetastoreTable(serde, path)), message)
-
-        case (Some(_), None) =>
-          val message =
-            s"Data source table $qualifiedTableName is not file based. Persisting it into " +
-              s"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive."
-          (None, message)
+            s"Persisting file based data source table $qualifiedTableName into " +
+              s"Hive metastore in Hive compatible format."
+          (Some(newHiveCompatibleMetastoreTable(serde)), message)
 
         case _ =>
           val provider = tableDefinition.provider.get

