You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/12/28 08:23:35 UTC
[spark] branch master updated: [SPARK-32685][SQL] When specify
serde, default filed.delim is '\t'
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new fc508d1 [SPARK-32685][SQL] When specify serde, default filed.delim is '\t'
fc508d1 is described below
commit fc508d189820f7b64a507709738786856dd89f8a
Author: angerszhu <an...@gmail.com>
AuthorDate: Mon Dec 28 08:23:01 2020 +0000
[SPARK-32685][SQL] When specify serde, default filed.delim is '\t'
### What changes were proposed in this pull request?
In Hive script transform, when a serde is specified, the default `field.delim` is '\t'.
![image](https://user-images.githubusercontent.com/46485123/103187960-7dd77800-4901-11eb-8241-f4636e66fbc8.png)
After changing to another serde and explaining the query plan, `field.delim` is the same.
With Spark's current code, the result is as below:
![image](https://user-images.githubusercontent.com/46485123/103187999-95aefc00-4901-11eb-9850-5c385000b78c.png)
We should keep the same behavior as Hive.
Note:
The difference in the result's NULL value is a separate issue: https://issues.apache.org/jira/browse/SPARK-32684
### Why are the changes needed?
To keep the behavior consistent with the Hive serde.
### Does this PR introduce _any_ user-facing change?
In script transform, when a serde is specified but `field.delim` is not, `field.delim` defaults to `\t`, the same as Hive.
### How was this patch tested?
UT added
Closes #30942 from AngersZhuuuu/SPARK-32685.
Authored-by: angerszhu <an...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../spark/sql/execution/SparkSqlParser.scala | 3 +-
.../execution/HiveScriptTransformationSuite.scala | 63 ++++++++++++++++++++++
2 files changed, 65 insertions(+), 1 deletion(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index e530b4c..16e1914 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -505,7 +505,8 @@ class SparkSqlAstBuilder extends AstBuilder {
} else {
None
}
- (Seq.empty, Option(name), props.toSeq, recordHandler)
+ val finalProps = props ++ Seq("field.delim" -> props.getOrElse("field.delim", "\t"))
+ (Seq.empty, Option(name), finalProps.toSeq, recordHandler)
case null =>
// Use default (serde) format.
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
index 0876709..266c526 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala
@@ -23,6 +23,7 @@ import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
import org.scalatest.exceptions.TestFailedException
import org.apache.spark.{SparkException, TestUtils}
+import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
import org.apache.spark.sql.execution._
import org.apache.spark.sql.functions._
@@ -438,4 +439,66 @@ class HiveScriptTransformationSuite extends BaseScriptTransformationSuite with T
assert(e2.contains("array<double> cannot be converted to Hive TypeInfo"))
}
}
+
+ test("SPARK-32685: When use specified serde, filed.delim's default value is '\t'") {
+ val query1 = sql(
+ """
+ |SELECT split(value, "\t") FROM (
+ |SELECT TRANSFORM(a, b, c)
+ |USING 'cat'
+ |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+ |) temp;
+ """.stripMargin)
+ checkAnswer(query1, identity, Row(Seq("2", "3")) :: Nil)
+
+ val query2 = sql(
+ """
+ |SELECT split(value, "\t") FROM (
+ |SELECT TRANSFORM(a, b, c)
+ | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ |USING 'cat'
+ | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ | WITH SERDEPROPERTIES (
+ | 'serialization.last.column.takes.rest' = 'true'
+ | )
+ |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+ |) temp;
+ """.stripMargin)
+ checkAnswer(query2, identity, Row(Seq("2", "3")) :: Nil)
+
+ val query3 = sql(
+ """
+ |SELECT split(value, "&") FROM (
+ |SELECT TRANSFORM(a, b, c)
+ | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ | WITH SERDEPROPERTIES (
+ | 'field.delim' = '&'
+ | )
+ |USING 'cat'
+ | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ | WITH SERDEPROPERTIES (
+ | 'serialization.last.column.takes.rest' = 'true',
+ | 'field.delim' = '&'
+ | )
+ |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+ |) temp;
+ """.stripMargin)
+ checkAnswer(query3, identity, Row(Seq("2", "3")) :: Nil)
+
+ val query4 = sql(
+ """
+ |SELECT split(value, "&") FROM (
+ |SELECT TRANSFORM(a, b, c)
+ | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ |USING 'cat'
+ | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+ | WITH SERDEPROPERTIES (
+ | 'serialization.last.column.takes.rest' = 'true',
+ | 'field.delim' = '&'
+ | )
+ |FROM (SELECT 1 AS a, 2 AS b, 3 AS c) t
+ |) temp;
+ """.stripMargin)
+ checkAnswer(query4, identity, Row(null) :: Nil)
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org