You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/11/03 00:47:14 UTC
git commit: [SPARK-4185][SQL] JSON schema inference failed when dealing with type conflicts in arrays
Repository: spark
Updated Branches:
refs/heads/master e749f5ded -> 06232d23f
[SPARK-4185][SQL] JSON schema inference failed when dealing with type conflicts in arrays
JIRA: https://issues.apache.org/jira/browse/SPARK-4185.
This PR also includes the fix for #3052.
Author: Yin Huai <hu...@cse.ohio-state.edu>
Closes #3056 from yhuai/SPARK-4185 and squashes the following commits:
ed3a5a8 [Yin Huai] Correctly handle type conflicts between structs and primitive types in an array.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/06232d23
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/06232d23
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/06232d23
Branch: refs/heads/master
Commit: 06232d23ff2a6344c49fff81364d9f6b02af326b
Parents: e749f5d
Author: Yin Huai <hu...@cse.ohio-state.edu>
Authored: Sun Nov 2 15:46:56 2014 -0800
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Sun Nov 2 15:46:56 2014 -0800
----------------------------------------------------------------------
.../scala/org/apache/spark/sql/json/JsonRDD.scala | 16 +++++++++++-----
.../scala/org/apache/spark/sql/json/JsonSuite.scala | 9 ++++++---
.../org/apache/spark/sql/json/TestJsonData.scala | 4 +++-
3 files changed, 20 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/06232d23/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index 5bb6f6c..0f2dcdc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -73,16 +73,18 @@ private[sql] object JsonRDD extends Logging {
def makeStruct(values: Seq[Seq[String]], prefix: Seq[String]): StructType = {
val (topLevel, structLike) = values.partition(_.size == 1)
+
val topLevelFields = topLevel.filter {
name => resolved.get(prefix ++ name).get match {
case ArrayType(elementType, _) => {
def hasInnerStruct(t: DataType): Boolean = t match {
- case s: StructType => false
+ case s: StructType => true
case ArrayType(t1, _) => hasInnerStruct(t1)
- case o => true
+ case o => false
}
- hasInnerStruct(elementType)
+ // Check if this array has inner struct.
+ !hasInnerStruct(elementType)
}
case struct: StructType => false
case _ => true
@@ -90,8 +92,11 @@ private[sql] object JsonRDD extends Logging {
}.map {
a => StructField(a.head, resolved.get(prefix ++ a).get, nullable = true)
}
+ val topLevelFieldNameSet = topLevelFields.map(_.name)
- val structFields: Seq[StructField] = structLike.groupBy(_(0)).map {
+ val structFields: Seq[StructField] = structLike.groupBy(_(0)).filter {
+ case (name, _) => !topLevelFieldNameSet.contains(name)
+ }.map {
case (name, fields) => {
val nestedFields = fields.map(_.tail)
val structType = makeStruct(nestedFields, prefix :+ name)
@@ -354,7 +359,8 @@ private[sql] object JsonRDD extends Logging {
case (key, value) =>
if (count > 0) builder.append(",")
count += 1
- builder.append(s"""\"${key}\":${toString(value)}""")
+ val stringValue = if (value.isInstanceOf[String]) s"""\"$value\"""" else toString(value)
+ builder.append(s"""\"${key}\":${stringValue}""")
}
builder.append("}")
http://git-wip-us.apache.org/repos/asf/spark/blob/06232d23/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 362c7e1..4b851d1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -483,7 +483,8 @@ class JsonSuite extends QueryTest {
val expectedSchema = StructType(
StructField("array1", ArrayType(StringType, true), true) ::
StructField("array2", ArrayType(StructType(
- StructField("field", LongType, true) :: Nil), false), true) :: Nil)
+ StructField("field", LongType, true) :: Nil), false), true) ::
+ StructField("array3", ArrayType(StringType, false), true) :: Nil)
assert(expectedSchema === jsonSchemaRDD.schema)
@@ -492,12 +493,14 @@ class JsonSuite extends QueryTest {
checkAnswer(
sql("select * from jsonTable"),
Seq(Seq("1", "1.1", "true", null, "[]", "{}", "[2,3,4]",
- """{"field":str}"""), Seq(Seq(214748364700L), Seq(1))) :: Nil
+ """{"field":"str"}"""), Seq(Seq(214748364700L), Seq(1)), null) ::
+ Seq(null, null, Seq("""{"field":"str"}""", """{"field":1}""")) ::
+ Seq(null, null, Seq("1", "2", "3")) :: Nil
)
// Treat an element as a number.
checkAnswer(
- sql("select array1[0] + 1 from jsonTable"),
+ sql("select array1[0] + 1 from jsonTable where array1 is not null"),
2
)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/06232d23/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
index c204162..e5773a5 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
@@ -57,7 +57,9 @@ object TestJsonData {
val arrayElementTypeConflict =
TestSQLContext.sparkContext.parallelize(
"""{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}],
- "array2": [{"field":214748364700}, {"field":1}]}""" :: Nil)
+ "array2": [{"field":214748364700}, {"field":1}]}""" ::
+ """{"array3": [{"field":"str"}, {"field":1}]}""" ::
+ """{"array3": [1, 2, 3]}""" :: Nil)
val missingFields =
TestSQLContext.sparkContext.parallelize(
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org