Posted to commits@spark.apache.org by we...@apache.org on 2020/02/13 04:35:43 UTC
[spark] branch branch-3.0 updated: [SPARK-30790] The dataType of map() should be map<null,null>
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 8ab6ae3 [SPARK-30790] The dataType of map() should be map<null,null>
8ab6ae3 is described below
commit 8ab6ae3ede96adb093347470a5cbbf17fe8c04e9
Author: iRakson <ra...@gmail.com>
AuthorDate: Thu Feb 13 12:23:40 2020 +0800
[SPARK-30790] The dataType of map() should be map<null,null>
### What changes were proposed in this pull request?
`spark.sql("select map()")` currently returns `{}` with data type `map<string,string>`.
After these changes its data type will be `map<null,null>`.
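For illustration, a minimal spark-shell sketch of the behavior after this patch (assuming a running `SparkSession` named `spark` is in scope, as in spark-shell):

```scala
import org.apache.spark.sql.types.{MapType, NullType}

// After this change, map() with no arguments defaults to NullType
// for both the key type and the value type.
val mapType = spark.sql("select map()").schema.head.dataType.asInstanceOf[MapType]
assert(mapType.keyType == NullType && mapType.valueType == NullType)
```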
### Why are the changes needed?
After the changes introduced by #27521, which made a no-argument `array()` default to `NullType`, `map()` should behave consistently and use the same default.
### Does this PR introduce any user-facing change?
Yes. A no-argument `map()` now has data type `map<null,null>` instead of `map<string,string>`.
### How was this patch tested?
A unit test was added, and the migration guide was updated as well.
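For completeness, a hedged sketch of restoring the pre-3.0 behavior with the legacy flag documented in the migration guide (again assuming a `SparkSession` named `spark`):

```scala
import org.apache.spark.sql.types.{MapType, StringType}

// With the legacy flag enabled, empty array()/map() fall back to StringType,
// matching the Spark 2.4 default.
spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")
val legacyType = spark.sql("select map()").schema.head.dataType.asInstanceOf[MapType]
assert(legacyType.keyType == StringType && legacyType.valueType == StringType)
```

Note the config is marked `.internal()`, so it is intended as an escape hatch rather than a long-term setting.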
Closes #27542 from iRakson/SPARK-30790.
Authored-by: iRakson <ra...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
(cherry picked from commit 926e3a1efe9e142804fcbf52146b22700640ae1b)
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
docs/sql-migration-guide.md | 2 +-
.../catalyst/expressions/complexTypeCreator.scala | 14 +++++++++---
.../sql/catalyst/util/ArrayBasedMapBuilder.scala | 5 ++---
.../org/apache/spark/sql/internal/SQLConf.scala | 10 ++++-----
.../apache/spark/sql/DataFrameFunctionsSuite.scala | 25 +++++++++++++++-------
5 files changed, 36 insertions(+), 20 deletions(-)
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index f98fab5..46b7416 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -216,7 +216,7 @@ license: |
- Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`.
- - Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`.
+ - Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType` to `true`.
- Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 7335e30..4bd85d3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -46,7 +46,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
}
private val defaultElementType: DataType = {
- if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) {
+ if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) {
StringType
} else {
NullType
@@ -145,6 +145,14 @@ case class CreateMap(children: Seq[Expression]) extends Expression {
lazy val keys = children.indices.filter(_ % 2 == 0).map(children)
lazy val values = children.indices.filter(_ % 2 != 0).map(children)
+ private val defaultElementType: DataType = {
+ if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) {
+ StringType
+ } else {
+ NullType
+ }
+ }
+
override def foldable: Boolean = children.forall(_.foldable)
override def checkInputDataTypes(): TypeCheckResult = {
@@ -167,9 +175,9 @@ case class CreateMap(children: Seq[Expression]) extends Expression {
override lazy val dataType: MapType = {
MapType(
keyType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(keys.map(_.dataType))
- .getOrElse(StringType),
+ .getOrElse(defaultElementType),
valueType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(values.map(_.dataType))
- .getOrElse(StringType),
+ .getOrElse(defaultElementType),
valueContainsNull = values.exists(_.nullable))
}
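To make the hunk above easier to follow: `dataType` asks `TypeCoercion` for a common type across the keys (and values), and only falls back to `defaultElementType` when there are no children at all, i.e. for `map()`. A simplified, standalone sketch of that fallback shape (toy types as stand-ins, not Spark's actual `TypeCoercion`):

```scala
sealed trait DataType
case object StringType extends DataType
case object NullType extends DataType
case object IntegerType extends DataType

// Toy stand-in: returns None for an empty list, mimicking how the real
// findCommonTypeDifferentOnlyInNullFlags yields nothing for map().
def findCommonType(types: Seq[DataType]): Option[DataType] =
  types.reduceOption((a, b) => if (a == b) a else StringType)

def keyTypeOf(keys: Seq[DataType], legacyStringType: Boolean): DataType = {
  val defaultElementType = if (legacyStringType) StringType else NullType
  findCommonType(keys).getOrElse(defaultElementType)
}

// map() with no arguments: the default element type wins.
assert(keyTypeOf(Nil, legacyStringType = false) == NullType)
assert(keyTypeOf(Nil, legacyStringType = true) == StringType)
// map(1, 'a'): a real key type is found, so the default is ignored.
assert(keyTypeOf(Seq(IntegerType), legacyStringType = true) == IntegerType)
```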
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
index 9893436..37d6530 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala
@@ -29,12 +29,11 @@ import org.apache.spark.unsafe.array.ByteArrayMethods
*/
class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Serializable {
assert(!keyType.existsRecursively(_.isInstanceOf[MapType]), "key of map cannot be/contain map")
- assert(keyType != NullType, "map key cannot be null type.")
private lazy val keyToIndex = keyType match {
// Binary type data is `byte[]`, which can't use `==` to check equality.
- case _: AtomicType | _: CalendarIntervalType if !keyType.isInstanceOf[BinaryType] =>
- new java.util.HashMap[Any, Int]()
+ case _: AtomicType | _: CalendarIntervalType | _: NullType
+ if !keyType.isInstanceOf[BinaryType] => new java.util.HashMap[Any, Int]()
case _ =>
// for complex types, use interpreted ordering to be able to compare unsafe data with safe
// data, e.g. UnsafeRow vs GenericInternalRow.
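One detail worth noting in the hunk above: with the `keyType != NullType` assertion removed, a `NullType` key (which only arises for the empty `map()`) takes the `java.util.HashMap` path. That is safe because `java.util.HashMap` permits a null key, as this small standalone check illustrates:

```scala
// java.util.HashMap allows one null key, so equality-based key
// deduplication still works when the key type is NullType.
val keyToIndex = new java.util.HashMap[Any, Int]()
keyToIndex.put(null, 0)
assert(keyToIndex.containsKey(null))
assert(keyToIndex.get(null) == 0)
```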
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b79b767..442711d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2007,12 +2007,12 @@ object SQLConf {
.booleanConf
.createWithDefault(false)
- val LEGACY_ARRAY_DEFAULT_TO_STRING =
- buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled")
+ val LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE =
+ buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType")
.internal()
- .doc("When set to true, it returns an empty array of string type when the `array` " +
- "function is called without any parameters. Otherwise, it returns an empty " +
- "array of `NullType`")
+ .doc("When set to true, Spark returns an empty collection with `StringType` as element " +
+ "type if the `array`/`map` function is called without any parameters. Otherwise, Spark " +
+ "returns an empty collection with `NullType` as element type.")
.booleanConf
.createWithDefault(false)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 6012678..f7531ea 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -3499,13 +3499,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_))
}
- test("SPARK-21281 use string types by default if map have no argument") {
- val ds = spark.range(1)
- var expectedSchema = new StructType()
- .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false)
- assert(ds.select(map().as("x")).schema == expectedSchema)
- }
-
test("SPARK-21281 fails if functions have no argument") {
val df = Seq(1).toDF("a")
@@ -3563,7 +3556,8 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
test("SPARK-29462: Empty array of NullType for array function with no arguments") {
Seq((true, StringType), (false, NullType)).foreach {
case (arrayDefaultToString, expectedType) =>
- withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) {
+ withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key ->
+ arrayDefaultToString.toString) {
val schema = spark.range(1).select(array()).schema
assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType])
val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType
@@ -3571,6 +3565,21 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
}
}
}
+
+ test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") {
+ Seq((true, StringType), (false, NullType)).foreach {
+ case (mapDefaultToString, expectedType) =>
+ withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key ->
+ mapDefaultToString.toString) {
+ val schema = spark.range(1).select(map()).schema
+ assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType])
+ val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType
+ val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType
+ assert(actualKeyType === expectedType)
+ assert(actualValueType === expectedType)
+ }
+ }
+ }
}
object DataFrameFunctionsSuite {