Posted to commits@spark.apache.org by we...@apache.org on 2020/12/28 08:23:35 UTC

[spark] branch master updated: [SPARK-32685][SQL] When specifying a serde, default field.delim is '\t'

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new fc508d1  [SPARK-32685][SQL] When specifying a serde, default field.delim is '\t'
fc508d1 is described below

commit fc508d189820f7b64a507709738786856dd89f8a
Author: angerszhu <an...@gmail.com>
AuthorDate: Mon Dec 28 08:23:01 2020 +0000

    [SPARK-32685][SQL] When specifying a serde, default field.delim is '\t'
    
    ### What changes were proposed in this pull request?
    In Hive script transform, when a serde is specified, the default `field.delim` is '\t':
    ![image](https://user-images.githubusercontent.com/46485123/103187960-7dd77800-4901-11eb-8241-f4636e66fbc8.png)
    Switching to another serde and explaining the query plan shows the same `field.delim`.
    
    In Spark's current code, the result is as below:
    ![image](https://user-images.githubusercontent.com/46485123/103187999-95aefc00-4901-11eb-9850-5c385000b78c.png)
    
    Spark should keep the same behavior as Hive.
    
    Note: the difference in the result's NULL value is a separate issue, tracked in https://issues.apache.org/jira/browse/SPARK-32684
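    
    For illustration, a minimal sketch of the expected behavior (assuming a Hive-enabled
    SparkSession named `spark`; this query is illustrative and not part of the patch):
    
        // With no explicit 'field.delim', the serde now defaults to '\t', so the
        // default (key, value) output schema splits the script output on the first tab.
        val df = spark.sql(
          """
            |SELECT TRANSFORM(a, b, c)
            |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
            |USING 'cat'
            |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
          """.stripMargin)
        df.show(truncate = false)  // key = "1", value = "2\t3"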
    
    ### Why are the changes needed?
    To keep the serde behavior consistent with Hive.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. In script transform, when `field.delim` is not specified, it now defaults to `\t`, the same as Hive.
    
    ### How was this patch tested?
    UT added
    
    Closes #30942 from AngersZhuuuu/SPARK-32685.
    
    Authored-by: angerszhu <an...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/execution/SparkSqlParser.scala       |  3 +-
 .../execution/HiveScriptTransformationSuite.scala  | 63 ++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index e530b4c..16e1914 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -505,7 +505,8 @@ class SparkSqlAstBuilder extends AstBuilder {
           } else {
             None
           }
-          (Seq.empty, Option(name), props.toSeq, recordHandler)
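+          // SPARK-32685: if the user did not set 'field.delim' in SERDEPROPERTIES,
+          // default it to '\t' so that the behavior matches Hive script transform.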
+          val finalProps = props ++ Seq("field.delim" -> props.getOrElse("field.delim", "\t"))
+          (Seq.empty, Option(name), finalProps.toSeq, recordHandler)
 
         case null =>
           // Use default (serde) format.
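
For context, a minimal standalone sketch of the defaulting idiom added above (plain
Scala; `props` here is an illustrative stand-in for the parsed SERDEPROPERTIES):

  // User-supplied SERDEPROPERTIES without an explicit 'field.delim'.
  val props = Map("serialization.last.column.takes.rest" -> "true")
  // getOrElse keeps a user-supplied 'field.delim' if one is present and otherwise
  // falls back to Hive's script-transform default of '\t'.
  val finalProps = props ++ Seq("field.delim" -> props.getOrElse("field.delim", "\t"))
  assert(finalProps("field.delim") == "\t")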
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
index 0876709..266c526 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
@@ -23,6 +23,7 @@ import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
 import org.scalatest.exceptions.TestFailedException
 
 import org.apache.spark.{SparkException, TestUtils}
+import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.functions._
@@ -438,4 +439,66 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T
       assert(e2.contains("array<double> cannot be converted to Hive TypeInfo"))
     }
   }
+
+  test("SPARK-32685: When use specified serde, filed.delim's default value is '\t'") {
+    val query1 = sql(
+      """
+        |SELECT split(value, "\t") FROM (
+        |SELECT TRANSFORM(a, b, c)
+        |USING 'cat'
+        |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+        |) temp;
+      """.stripMargin)
+    checkAnswer(query1, identity, Row(Seq("2", "3")) :: Nil)
+
+    val query2 = sql(
+      """
+        |SELECT split(value, "\t") FROM (
+        |SELECT TRANSFORM(a, b, c)
+        |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+        |USING 'cat'
+        |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+        |  WITH SERDEPROPERTIES (
+        |   'serialization.last.column.takes.rest' = 'true'
+        |  )
+        |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+        |) temp;
+      """.stripMargin)
+    checkAnswer(query2, identity, Row(Seq("2", "3")) :: Nil)
+
+    val query3 = sql(
+      """
+        |SELECT split(value, "&") FROM (
+        |SELECT TRANSFORM(a, b, c)
+        |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+        |  WITH SERDEPROPERTIES (
+        |   'field.delim' = '&'
+        |  )
+        |USING 'cat'
+        |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+        |  WITH SERDEPROPERTIES (
+        |   'serialization.last.column.takes.rest' = 'true',
+        |   'field.delim' = '&'
+        |  )
+        |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+        |) temp;
+      """.stripMargin)
+    checkAnswer(query3, identity, Row(Seq("2", "3")) :: Nil)
+
+    val query4 = sql(
+      """
+        |SELECT split(value, "&") FROM (
+        |SELECT TRANSFORM(a, b, c)
+        |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+        |USING 'cat'
+        |  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+        |  WITH SERDEPROPERTIES (
+        |   'serialization.last.column.takes.rest' = 'true',
+        |   'field.delim' = '&'
+        |  )
+        |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+        |) temp;
+      """.stripMargin)
+    checkAnswer(query4, identity, Row(null) :: Nil)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org