Posted to commits@spark.apache.org by do...@apache.org on 2019/05/26 15:50:05 UTC

[spark] branch branch-2.3 updated: [SPARK-27441][SQL][TEST] Add read/write tests to Hive serde tables

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.3 by this push:
     new ba224f8  [SPARK-27441][SQL][TEST] Add read/write tests to Hive serde tables
ba224f8 is described below

commit ba224f8fab5622b8443e6fa460ed84110ff83f8f
Author: Yuming Wang <yu...@ebay.com>
AuthorDate: Sun May 26 08:35:58 2019 -0700

    [SPARK-27441][SQL][TEST] Add read/write tests to Hive serde tables
    
    ## What changes were proposed in this pull request?
    
    The Hive, Parquet, and ORC versions in play, before and after the built-in Hive is upgraded to 2.3.5 for the Hadoop 3.2 profile:
    
    - before, with built-in Hive 1.2.1.spark2:
    
    |                        | ORC           | Parquet |
    | ---------------------- | ------------- | ------- |
    | Spark datasource table | 1.5.5         | 1.10.1  |
    | Spark hive table       | Hive built-in | 1.6.0   |
    | Apache Hive 1.2.1      | Hive built-in | 1.6.0   |
    
    - after, with built-in Hive 2.3.5:
    
    |                        | ORC   | Parquet |
    | ----------------------- | ----- | ------- |
    | Spark datasource table | 1.5.5 | 1.10.1  |
    | Spark hive table       | 1.5.5 | [1.10.1](https://github.com/apache/spark/pull/24346) |
    | Apache Hive 2.3.5      | 1.3.4 | 1.8.1   |
    
    We should add tests for Hive serde tables. This PR adds read/write tests covering all supported data types for the Parquet and ORC serdes.
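    
    To see why the `convertMetastore*` flags matter here, a minimal sketch (a hypothetical spark-shell session, not taken from this patch; the table name `t` is illustrative): with conversion disabled, Spark routes reads and writes through Hive's own serde rather than its native reader, which is the code path this suite pins down.
    
    ```scala
    // With conversion off, `t` is read and written via the Hive Parquet serde.
    spark.conf.set("spark.sql.hive.convertMetastoreParquet", "false")
    spark.sql("CREATE TABLE t (c1 INT) STORED AS PARQUET")  // Hive serde table
    spark.sql("INSERT INTO TABLE t VALUES (1)")             // Spark write path
    spark.table("t").show()                                 // Spark read path
    ```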
    
    ## How was this patch tested?
    
    unit tests
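    
    Assuming the standard Spark sbt build, the suite can typically be run on its own with `build/sbt "hive/testOnly *HiveSerDeReadWriteSuite"`.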
    
    Closes #24345 from wangyum/SPARK-27441.
    
    Authored-by: Yuming Wang <yu...@ebay.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
    (cherry picked from commit 193304b51bee164c1d355b75be039f762118bdba)
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../hive/execution/HiveSerDeReadWriteSuite.scala   | 185 +++++++++++++++++++++
 1 file changed, 185 insertions(+)

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala
new file mode 100644
index 0000000..25ff354
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive.execution
+
+import java.sql.{Date, Timestamp}
+
+import org.apache.spark.sql.{QueryTest, Row}
+import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION
+import org.apache.spark.sql.test.SQLTestUtils
+
+class HiveSerDeReadWriteSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
+
+  private var originalConvertMetastoreParquet = CONVERT_METASTORE_PARQUET.defaultValueString
+  private var originalConvertMetastoreORC = CONVERT_METASTORE_ORC.defaultValueString
+  private var originalORCImplementation = ORC_IMPLEMENTATION.defaultValueString
+
+  protected override def beforeAll(): Unit = {
+    super.beforeAll()
+    originalConvertMetastoreParquet = spark.conf.get(CONVERT_METASTORE_PARQUET.key)
+    originalConvertMetastoreORC = spark.conf.get(CONVERT_METASTORE_ORC.key)
+    originalORCImplementation = spark.conf.get(ORC_IMPLEMENTATION)
+
+    spark.conf.set(CONVERT_METASTORE_PARQUET.key, "false")
+    spark.conf.set(CONVERT_METASTORE_ORC.key, "false")
+    spark.conf.set(ORC_IMPLEMENTATION.key, "hive")
+  }
+
+  protected override def afterAll(): Unit = {
+    spark.conf.set(CONVERT_METASTORE_PARQUET.key, originalConvertMetastoreParquet)
+    spark.conf.set(CONVERT_METASTORE_ORC.key, originalConvertMetastoreORC)
+    spark.conf.set(ORC_IMPLEMENTATION.key, originalORCImplementation)
+    super.afterAll()
+  }
+
+  private def checkNumericTypes(fileFormat: String, dataType: String, value: Any): Unit = {
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 $dataType) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values(1)")
+      checkAnswer(spark.table("hive_serde"), Row(1))
+      spark.sql(s"INSERT INTO TABLE hive_serde values($value)")
+      checkAnswer(spark.table("hive_serde"), Seq(Row(1), Row(value)))
+    }
+  }
+
+  private def checkDateTimeTypes(fileFormat: String): Unit = {
+    // TIMESTAMP
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 TIMESTAMP) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('2019-04-11 15:50:00')")
+      checkAnswer(spark.table("hive_serde"), Row(Timestamp.valueOf("2019-04-11 15:50:00")))
+      spark.sql("INSERT INTO TABLE hive_serde values('2019-04-12 15:50:00')")
+      checkAnswer(
+        spark.table("hive_serde"),
+        Seq(Row(Timestamp.valueOf("2019-04-11 15:50:00")),
+          Row(Timestamp.valueOf("2019-04-12 15:50:00"))))
+    }
+
+    // DATE
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 DATE) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('2019-04-11')")
+      checkAnswer(spark.table("hive_serde"), Row(Date.valueOf("2019-04-11")))
+      spark.sql("INSERT INTO TABLE hive_serde values('2019-04-12')")
+      checkAnswer(
+        spark.table("hive_serde"),
+        Seq(Row(Date.valueOf("2019-04-11")), Row(Date.valueOf("2019-04-12"))))
+    }
+  }
+
+  private def checkStringTypes(fileFormat: String, dataType: String, value: String): Unit = {
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 $dataType) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('s')")
+      checkAnswer(spark.table("hive_serde"), Row("s"))
+      spark.sql(s"INSERT INTO TABLE hive_serde values('$value')")
+      checkAnswer(spark.table("hive_serde"), Seq(Row("s"), Row(value)))
+    }
+  }
+
+  private def checkCharTypes(fileFormat: String): Unit = {
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 CHAR(10)) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('s')")
+      checkAnswer(spark.table("hive_serde"), Row("s" + " " * 9))
+      spark.sql(s"INSERT INTO TABLE hive_serde values('s3')")
+      checkAnswer(spark.table("hive_serde"), Seq(Row("s" + " " * 9), Row("s3" + " " * 8)))
+    }
+  }
+
+  private def checkMiscTypes(fileFormat: String): Unit = {
+    // BOOLEAN
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 BOOLEAN) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values(false)")
+      checkAnswer(spark.table("hive_serde"), Row(false))
+      spark.sql("INSERT INTO TABLE hive_serde values(true)")
+      checkAnswer(spark.table("hive_serde"), Seq(Row(false), Row(true)))
+    }
+
+    // BINARY
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 BINARY) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('1')")
+      checkAnswer(spark.table("hive_serde"), Row("1".getBytes))
+      spark.sql("INSERT INTO TABLE hive_serde values('2')")
+      checkAnswer(spark.table("hive_serde"), Seq(Row("1".getBytes), Row("2".getBytes)))
+    }
+  }
+
+  private def checkComplexTypes(fileFormat: String): Unit = {
+    // ARRAY<data_type>
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 ARRAY <STRING>) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde SELECT ARRAY('a','b') FROM (SELECT 1) t")
+      checkAnswer(spark.table("hive_serde"), Row(Array("a", "b")))
+      spark.sql("INSERT INTO TABLE hive_serde SELECT ARRAY('c', 'd')")
+      checkAnswer(spark.table("hive_serde"), Seq(Row(Array("a", "b")), Row(Array("c", "d"))))
+    }
+    // MAP<primitive_type, data_type>
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 MAP <INT, STRING>) STORED AS $fileFormat")
+      hiveClient.runSqlHive("INSERT INTO TABLE hive_serde SELECT MAP(1, 'a') FROM (SELECT 1) t")
+      checkAnswer(spark.table("hive_serde"), Row(Map(1 -> "a")))
+      spark.sql("INSERT INTO TABLE hive_serde SELECT MAP(2, 'b')")
+      checkAnswer(spark.table("hive_serde"), Seq(Row(Map(1 -> "a")), Row(Map(2 -> "b"))))
+    }
+
+    // STRUCT<col_name : data_type [COMMENT col_comment], ...>
+    withTable("hive_serde") {
+      hiveClient.runSqlHive(
+        s"CREATE TABLE hive_serde (c1 STRUCT <k: INT>) STORED AS $fileFormat")
+      hiveClient.runSqlHive(
+        "INSERT INTO TABLE hive_serde SELECT NAMED_STRUCT('k', 1) FROM (SELECT 1) t")
+      checkAnswer(spark.table("hive_serde"), Row(Row(1)))
+      spark.sql("INSERT INTO TABLE hive_serde SELECT NAMED_STRUCT('k', 2)")
+      checkAnswer(spark.table("hive_serde"), Seq(Row(Row(1)), Row(Row(2))))
+    }
+  }
+
+  Seq("PARQUET", "ORC").foreach { fileFormat =>
+    test(s"Read/Write Hive $fileFormat serde table") {
+      // Numeric Types
+      checkNumericTypes(fileFormat, "TINYINT", 2)
+      checkNumericTypes(fileFormat, "SMALLINT", 2)
+      checkNumericTypes(fileFormat, "INT", 2)
+      checkNumericTypes(fileFormat, "BIGINT", 2)
+      checkNumericTypes(fileFormat, "FLOAT", 2.1F)
+      checkNumericTypes(fileFormat, "DOUBLE", 2.1D)
+      checkNumericTypes(fileFormat, "DECIMAL(9, 2)", 2.1D)
+      checkNumericTypes(fileFormat, "DECIMAL(18, 2)", 2.1D)
+      checkNumericTypes(fileFormat, "DECIMAL(38, 2)", 2.1D)
+
+      // Date/Time Types
+      checkDateTimeTypes(fileFormat)
+
+      // String Types
+      checkStringTypes(fileFormat, "STRING", "s1")
+      checkStringTypes(fileFormat, "VARCHAR(10)", "s2")
+      checkCharTypes(fileFormat)
+
+      // Misc Types
+      checkMiscTypes(fileFormat)
+
+      // Complex Types
+      checkComplexTypes(fileFormat)
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org