You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/04/08 09:30:29 UTC
spark git commit: [SPARK-14189][SQL] JSON data sources find compatible types even if inferred decimal type is not capable of the others

Repository: spark
Updated Branches:
  refs/heads/master 725b860e2 -> 73b56a3c6


[SPARK-14189][SQL] JSON data sources find compatible types even if inferred decimal type is not capable of the others

## What changes were proposed in this pull request?

https://issues.apache.org/jira/browse/SPARK-14189

When inferred types in the same field during finding compatible `DataType`, are `IntegralType` and `DecimalType` but `DecimalType` is not capable of the given `IntegralType`, JSON data source simply fails to find a compatible type resulting in `StringType`.

This can be observed when `prefersDecimal` is enabled.

```scala
def mixedIntegerAndDoubleRecords: RDD[String] =
  sqlContext.sparkContext.parallelize(
    """{"a": 3, "b": 1.1}""" ::
    """{"a": 3.1, "b": 1}""" :: Nil)

val jsonDF = sqlContext.read
  .option("prefersDecimal", "true")
  .json(mixedIntegerAndDoubleRecords)
  .printSchema()
```

- **Before**

```
root
 |-- a: string (nullable = true)
 |-- b: string (nullable = true)
```

- **After**

```
root
 |-- a: decimal(21, 1) (nullable = true)
 |-- b: decimal(21, 1) (nullable = true)
```
(Note that integer is inferred as `LongType` which becomes `DecimalType(20, 0)`)

## How was this patch tested?

unit tests were used and style tests by `dev/run_tests`.

Author: hyukjinkwon <gu...@gmail.com>

Closes #11993 from HyukjinKwon/SPARK-14189.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73b56a3c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73b56a3c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73b56a3c

Branch: refs/heads/master
Commit: 73b56a3c6c5c590219b42884c8bbe88b0a236987
Parents: 725b860
Author: hyukjinkwon <gu...@gmail.com>
Authored: Fri Apr 8 00:30:26 2016 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Fri Apr 8 00:30:26 2016 -0700

----------------------------------------------------------------------
 .../datasources/json/InferSchema.scala          |  8 +++++++
 .../execution/datasources/json/JsonSuite.scala  | 22 ++++++++++++++++++++
 2 files changed, 30 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/73b56a3c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
index 4a34f36..8e8238a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
@@ -256,6 +256,14 @@ private[sql] object InferSchema {
         case (ArrayType(elementType1, containsNull1), ArrayType(elementType2, containsNull2)) =>
           ArrayType(compatibleType(elementType1, elementType2), containsNull1 || containsNull2)
 
+        // The case that given `DecimalType` is capable of given `IntegralType` is handled in
+        // `findTightestCommonTypeOfTwo`. Both cases below will be executed only when
+        // the given `DecimalType` is not capable of the given `IntegralType`.
+        case (t1: IntegralType, t2: DecimalType) =>
+          compatibleType(DecimalType.forType(t1), t2)
+        case (t1: DecimalType, t2: IntegralType) =>
+          compatibleType(t1, DecimalType.forType(t2))
+
         // strings and every string is a Json object.
         case (_, _) => StringType
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/73b56a3c/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 421862c..2a18acb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -773,6 +773,28 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     )
   }
 
+  test("Find compatible types even if inferred DecimalType is not capable of other IntegralType") {
+    val mixedIntegerAndDoubleRecords = sparkContext.parallelize(
+      """{"a": 3, "b": 1.1}""" ::
+      s"""{"a": 3.1, "b": 0.${"0" * 38}1}""" :: Nil)
+    val jsonDF = sqlContext.read
+      .option("prefersDecimal", "true")
+      .json(mixedIntegerAndDoubleRecords)
+
+    // The values in `a` field will be decimals as they fit in decimal. For `b` field,
+    // they will be doubles as `1.0E-39D` does not fit.
+    val expectedSchema = StructType(
+      StructField("a", DecimalType(21, 1), true) ::
+      StructField("b", DoubleType, true) :: Nil)
+
+    assert(expectedSchema === jsonDF.schema)
+    checkAnswer(
+      jsonDF,
+      Row(BigDecimal("3"), 1.1D) ::
+      Row(BigDecimal("3.1"), 1.0E-39D) :: Nil
+    )
+  }
+
   test("Infer big integers correctly even when it does not fit in decimal") {
     val jsonDF = sqlContext.read
       .json(bigIntegerRecords)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org