Posted to commits@spark.apache.org by ma...@apache.org on 2014/12/18 00:02:31 UTC

spark git commit: [SPARK-4856] [SQL] NullType instead of StringType when sampling against empty string or nul...

Repository: spark
Updated Branches:
  refs/heads/master 19c0faad6 -> 8d0d2a65e


[SPARK-4856] [SQL] NullType instead of StringType when sampling against empty string or nul...

```
TestSQLContext.sparkContext.parallelize(
  """{"ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
  """{"ip":"27.31.100.29","headers":{}}""" ::
  """{"ip":"27.31.100.29","headers":""}""" :: Nil)
```
Because the empty string value of "headers" (in lines 2 and 3 of the example) is initially inferred as StringType, schema inference ignores the real nested data type (the struct type of "headers" in line 1) and ends up treating "headers" in line 1 as StringType as well, which is not what we expect.
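
For reference, a minimal way to observe the inferred schema, using the Spark 1.2-era `jsonRDD` API and the `TestSQLContext` test harness (the variable names and the printed tree below are illustrative, not output from this patch):

```scala
import org.apache.spark.sql.test.TestSQLContext

// The three sample records above, run through JSON schema inference.
val records = TestSQLContext.sparkContext.parallelize(
  """{"ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
  """{"ip":"27.31.100.29","headers":{}}""" ::
  """{"ip":"27.31.100.29","headers":""}""" :: Nil)

val schemaRDD = TestSQLContext.jsonRDD(records)

// Before this patch "headers" collapses to StringType and the nested
// Host/Charset fields are lost. After the patch the "" value is treated
// as NullType, so the struct from line 1 wins; roughly:
//   root
//    |-- headers: struct (nullable = true)
//    |    |-- Charset: string (nullable = true)
//    |    |-- Host: string (nullable = true)
//    |-- ip: string (nullable = true)
schemaRDD.printSchema()
```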

Author: Cheng Hao <ha...@intel.com>

Closes #3708 from chenghao-intel/json and squashes the following commits:

e7a72e9 [Cheng Hao] add more concise unit test
853de51 [Cheng Hao] NullType instead of StringType when sampling against empty string or null value


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d0d2a65
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d0d2a65
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d0d2a65

Branch: refs/heads/master
Commit: 8d0d2a65eb3a7b1865f7fa7cc18b146fc6474620
Parents: 19c0faa
Author: Cheng Hao <ha...@intel.com>
Authored: Wed Dec 17 15:01:59 2014 -0800
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Wed Dec 17 15:01:59 2014 -0800

----------------------------------------------------------------------
 .../org/apache/spark/sql/json/JsonRDD.scala      |  4 +++-
 .../org/apache/spark/sql/json/JsonSuite.scala    | 19 +++++++++++++++++++
 .../org/apache/spark/sql/json/TestJsonData.scala |  7 +++++++
 3 files changed, 29 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8d0d2a65/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
index ffb9548..00449c2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala
@@ -263,6 +263,8 @@ private[sql] object JsonRDD extends Logging {
         val elementType = typeOfArray(array)
         buildKeyPathForInnerStructs(array, elementType) :+ (key, elementType)
       }
+      // we couldn't tell what the type is if the value is null or empty string
+      case (key: String, value) if value == "" || value == null => (key, NullType) :: Nil
       case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
     }
   }
@@ -400,13 +402,13 @@ private[sql] object JsonRDD extends Logging {
     } else {
       desiredType match {
         case StringType => toString(value)
+        case _ if value == null || value == "" => null // guard the non string type
         case IntegerType => value.asInstanceOf[IntegerType.JvmType]
         case LongType => toLong(value)
         case DoubleType => toDouble(value)
         case DecimalType() => toDecimal(value)
         case BooleanType => value.asInstanceOf[BooleanType.JvmType]
         case NullType => null
-
         case ArrayType(elementType, _) =>
           value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
         case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
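
The schema-side change above emits NullType for "" and null values; the value-side guard in enforceCorrectType then returns null for those values instead of casting them to the non-string desired type. A minimal sketch of the merging idea (this is not the actual JsonRDD.compatibleType code, just an illustration of why NullType placeholders let the real struct survive):

```scala
import org.apache.spark.sql.catalyst.types._

// Sketch: when per-record types for a key are folded together, a NullType
// placeholder yields to whatever concrete type another record provides.
def mergeTypes(t1: DataType, t2: DataType): DataType = (t1, t2) match {
  case (NullType, other) => other
  case (other, NullType) => other
  case (a, b) if a == b  => a
  case _                 => StringType // conflicting types fall back to string
}

// Line 1 contributes a struct for "headers"; lines 2 and 3 now contribute
// NullType instead of StringType, so the struct survives the merge.
val headersType = Seq[DataType](
  StructType(
    StructField("Host", StringType, true) ::
    StructField("Charset", StringType, true) :: Nil),
  NullType,
  NullType).reduce(mergeTypes)
```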

http://git-wip-us.apache.org/repos/asf/spark/blob/8d0d2a65/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
index f088d41..8dce337 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala
@@ -193,6 +193,25 @@ class JsonSuite extends QueryTest {
       StringType)
   }
 
+  test("Complex field and type inferring with null in sampling") {
+    val jsonSchemaRDD = jsonRDD(jsonNullStruct)
+    val expectedSchema = StructType(
+      StructField("headers", StructType(
+        StructField("Charset", StringType, true) ::
+          StructField("Host", StringType, true) :: Nil)
+        , true) ::
+        StructField("ip", StringType, true) ::
+        StructField("nullstr", StringType, true):: Nil)
+
+    assert(expectedSchema === jsonSchemaRDD.schema)
+    jsonSchemaRDD.registerTempTable("jsonTable")
+
+    checkAnswer(
+      sql("select nullstr, headers.Host from jsonTable"),
+      Seq(Row("", "1.abc.com"), Row("", null), Row("", null), Row(null, null))
+    )
+  }
+
   test("Primitive field and type inferring") {
     val jsonSchemaRDD = jsonRDD(primitiveFieldAndType)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/8d0d2a65/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
index e5773a5..3370b3c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
@@ -43,6 +43,13 @@ object TestJsonData {
       """{"num_num_1":21474836570, "num_num_2":1.1, "num_num_3": 21474836470,
           "num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil)
 
+  val jsonNullStruct =
+    TestSQLContext.sparkContext.parallelize(
+      """{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
+        """{"nullstr":"","ip":"27.31.100.29","headers":{}}""" ::
+        """{"nullstr":"","ip":"27.31.100.29","headers":""}""" ::
+        """{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil)
+
   val complexFieldValueTypeConflict =
     TestSQLContext.sparkContext.parallelize(
       """{"num_struct":11, "str_array":[1, 2, 3],

