You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2018/07/17 21:15:33 UTC
spark git commit: [SPARK-24681][SQL] Verify nested column names in
Hive metastore
Repository: spark
Updated Branches:
refs/heads/master 912634b00 -> 2a4dd6f06
[SPARK-24681][SQL] Verify nested column names in Hive metastore
## What changes were proposed in this pull request?
This PR adds code to check that nested column names do not include ',', ':', or ';', because the Hive metastore can't handle these characters in nested column names;
ref: https://github.com/apache/hive/blob/release-1.2.1/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java#L239
## How was this patch tested?
Added tests in `HiveDDLSuite`.
Author: Takeshi Yamamuro <ya...@apache.org>
Closes #21711 from maropu/SPARK-24681.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a4dd6f0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a4dd6f0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a4dd6f0
Branch: refs/heads/master
Commit: 2a4dd6f06cfd2f58fda9786c88809e6de695444e
Parents: 912634b
Author: Takeshi Yamamuro <ya...@apache.org>
Authored: Tue Jul 17 14:15:30 2018 -0700
Committer: Xiao Li <ga...@gmail.com>
Committed: Tue Jul 17 14:15:30 2018 -0700
----------------------------------------------------------------------
.../spark/sql/hive/HiveExternalCatalog.scala | 34 ++++++++++++++++----
.../spark/sql/hive/execution/HiveDDLSuite.scala | 19 +++++++++++
2 files changed, 46 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/2a4dd6f0/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 44480ce..7f28fc4 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -138,17 +138,37 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
}
/**
- * Checks the validity of data column names. Hive metastore disallows the table to use comma in
- * data column names. Partition columns do not have such a restriction. Views do not have such
- * a restriction.
+ * Checks the validity of data column names. Hive metastore disallows the table to use some
+ * special characters (',', ':', and ';') in data column names, including nested column names.
+ * Partition columns do not have such a restriction. Views do not have such a restriction.
*/
private def verifyDataSchema(
tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = {
if (tableType != VIEW) {
- dataSchema.map(_.name).foreach { colName =>
- if (colName.contains(",")) {
- throw new AnalysisException("Cannot create a table having a column whose name contains " +
- s"commas in Hive metastore. Table: $tableName; Column: $colName")
+ val invalidChars = Seq(",", ":", ";")
+ def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f =>
+ f.dataType match {
+ case st: StructType => verifyNestedColumnNames(st)
+ case _ if invalidChars.exists(f.name.contains) =>
+ val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ")
+ val errMsg = "Cannot create a table having a nested column whose name contains " +
+ s"invalid characters ($invalidCharsString) in Hive metastore. Table: $tableName; " +
+ s"Column: ${f.name}"
+ throw new AnalysisException(errMsg)
+ case _ =>
+ }
+ }
+
+ dataSchema.foreach { f =>
+ f.dataType match {
+ // Checks top-level column names
+ case _ if f.name.contains(",") =>
+ throw new AnalysisException("Cannot create a table having a column whose name " +
+ s"contains commas in Hive metastore. Table: $tableName; Column: ${f.name}")
+ // Checks nested column names
+ case st: StructType =>
+ verifyNestedColumnNames(st)
+ case _ =>
}
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/2a4dd6f0/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
index 0341c3b..31fd4c5 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException}
import org.apache.spark.sql.catalyst.catalog._
import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
+import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.HiveExternalCatalog
import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
import org.apache.spark.sql.hive.orc.OrcFileOperator
@@ -2248,4 +2249,22 @@ class HiveDDLSuite
checkAnswer(spark.table("t4"), Row(0, 0))
}
}
+
+ test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") {
+ val expectedMsg = "Cannot create a table having a nested column whose name contains invalid " +
+ "characters (',', ':', ';') in Hive metastore."
+
+ Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName =>
+ withTable("t") {
+ val e = intercept[AnalysisException] {
+ spark.range(1)
+ .select(struct(lit(0).as(nestedColumnName)).as("toplevel"))
+ .write
+ .format("hive")
+ .saveAsTable("t")
+ }.getMessage
+ assert(e.contains(expectedMsg))
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org