You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/07/08 02:06:03 UTC
git commit: [SPARK-2375][SQL] JSON schema inference may not resolve
type conflicts correctly for a field inside an array of structs
Repository: spark
Updated Branches:
refs/heads/master 4deeed17c -> f0496ee10
[SPARK-2375][SQL] JSON schema inference may not resolve type conflicts correctly for a field inside an array of structs
For example, for
```
{"array": [{"field":214748364700}, {"field":1}]}
```
the type of field is resolved as IntType, whereas for
```
{"array": [{"field":1}, {"field":214748364700}]}
```
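the type of field is resolved as LongType. The inferred type should not depend on the order of the array elements; in both cases it should be LongType.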
JIRA: https://issues.apache.org/jira/browse/SPARK-2375
Author: Yin Huai <hu...@gmail.com>
Closes #1308 from yhuai/SPARK-2375 and squashes the following commits:
3e2e312 [Yin Huai] Update unit test.
1b2ff9f [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2375
10794eb [Yin Huai] Correctly resolve the type of a field inside an array of structs.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f0496ee1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f0496ee1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f0496ee1
Branch: refs/heads/master
Commit: f0496ee10847db921a028a34f70385f9b740b3f3
Parents: 4deeed1
Author: Yin Huai <hu...@gmail.com>
Authored: Mon Jul 7 17:05:59 2014 -0700
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Mon Jul 7 17:05:59 2014 -0700
----------------------------------------------------------------------
.../src/main/scala/org/apache/spark/sql/json/JsonRDD.scala | 9 +++++----
.../test/scala/org/apache/spark/sql/json/JsonSuite.scala | 8 +++++---
.../test/scala/org/apache/spark/sql/json/TestJsonData.scala | 3 ++-
3 files changed, 12 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/f0496ee1/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index edf8677..f6cbca9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -198,11 +198,12 @@ private[sql] object JsonRDD extends Logging {
* in this JSON object can appear in other JSON objects.
*/
private def allKeysWithValueTypes(m: Map[String, Any]): Set[(String, DataType)] = {
- m.map{
+ val keyValuePairs = m.map {
// Quote the key with backticks to handle cases which have dots
// in the field name.
- case (key, dataType) => (s"`$key`", dataType)
- }.flatMap {
+ case (key, value) => (s"`$key`", value)
+ }.toSet
+ keyValuePairs.flatMap {
case (key: String, struct: Map[String, Any]) => {
// The value associted with the key is an JSON object.
allKeysWithValueTypes(struct).map {
@@ -224,7 +225,7 @@ private[sql] object JsonRDD extends Logging {
}
}
case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
- }.toSet
+ }
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/f0496ee1/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index 10bd9f0..e765cfc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -451,7 +451,9 @@ class JsonSuite extends QueryTest {
val jsonSchemaRDD = jsonRDD(arrayElementTypeConflict)
val expectedSchema =
- AttributeReference("array", ArrayType(StringType), true)() :: Nil
+ AttributeReference("array1", ArrayType(StringType), true)() ::
+ AttributeReference("array2", ArrayType(StructType(
+ StructField("field", LongType, true) :: Nil)), true)() :: Nil
comparePlans(Schema(expectedSchema), Schema(jsonSchemaRDD.logicalPlan.output))
@@ -460,12 +462,12 @@ class JsonSuite extends QueryTest {
checkAnswer(
sql("select * from jsonTable"),
Seq(Seq("1", "1.1", "true", null, "[]", "{}", "[2,3,4]",
- """{"field":str}""")) :: Nil
+ """{"field":str}"""), Seq(Seq(214748364700L), Seq(1))) :: Nil
)
// Treat an element as a number.
checkAnswer(
- sql("select array[0] + 1 from jsonTable"),
+ sql("select array1[0] + 1 from jsonTable"),
2
)
}
http://git-wip-us.apache.org/repos/asf/spark/blob/f0496ee1/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
index 065e040..d0180f3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
@@ -72,7 +72,8 @@ object TestJsonData {
val arrayElementTypeConflict =
TestSQLContext.sparkContext.parallelize(
- """{"array": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}]}""" :: Nil)
+ """{"array1": [1, 1.1, true, null, [], {}, [2,3,4], {"field":"str"}],
+ "array2": [{"field":214748364700}, {"field":1}]}""" :: Nil)
val missingFields =
TestSQLContext.sparkContext.parallelize(