You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2020/06/26 01:42:42 UTC
[spark] branch master updated: [SPARK-32025][SQL] Csv schema inference problems with different types in the same column

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new bbb2cba  [SPARK-32025][SQL] Csv schema inference problems with different types in the same column
bbb2cba is described below

commit bbb2cba6158455c9e7744b1dbc778eb5d0a8860f
Author: Pablo Langa <so...@gmail.com>
AuthorDate: Fri Jun 26 10:41:27 2020 +0900

    [SPARK-32025][SQL] Csv schema inference problems with different types in the same column
    
    ### What changes were proposed in this pull request?
    
    This pull request fixes a bug present in the csv type inference.
    We have problems when we have different types in the same column.
    
    **Previously:**
    ```
    $ cat /example/f1.csv
    col1
    43200000
    true
    
    spark.read.csv(path="file:///example/*.csv", header=True, inferSchema=True).show()
    +----+
    |col1|
    +----+
    |null|
    |true|
    +----+
    
    root
     |-- col1: boolean (nullable = true)
    ```
    **Now**
    ```
    spark.read.csv(path="file:///example/*.csv", header=True, inferSchema=True).show()
    +-------------+
    |col1          |
    +-------------+
    |43200000 |
    |true           |
    +-------------+
    
    root
     |-- col1: string (nullable = true)
    ```
    
    Previously the hierarchy of type inference is the following:
    
    > IntegerType
    > > LongType
    > > > DecimalType
    > > > > DoubleType
    > > > > > TimestampType
    > > > > > > BooleanType
    > > > > > > > StringType
    
    So, when, for example, we have integers in one column, and the last element is a boolean, all the column is inferred as a boolean column incorrectly and all the number are shown as null when you see the data
    
    We need the following hierarchy. When we have different numeric types in the column it will be resolved correctly. And when we have other different types it will be resolved as a String type column
    > IntegerType
    > > LongType
    > > > DecimalType
    > > > > DoubleType
    > > > > > StringType
    
    > TimestampType
    > > StringType
    
    > BooleanType
    > > StringType
    
    > StringType
    
    ### Why are the changes needed?
    
    Fix the bug explained
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Unit test and manual tests
    
    Closes #28896 from planga82/feature/SPARK-32025_csv_inference.
    
    Authored-by: Pablo Langa <so...@gmail.com>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 .../org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala   |  7 +++----
 .../apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala  | 10 +++++-----
 .../spark/sql/execution/datasources/csv/CSVSuite.scala       | 12 ++++++++++++
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
index f0df18d..56677d7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
@@ -102,13 +102,11 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
     if (field == null || field.isEmpty || field == options.nullValue) {
       typeSoFar
     } else {
-      typeSoFar match {
+      val typeElemInfer = typeSoFar match {
         case NullType => tryParseInteger(field)
         case IntegerType => tryParseInteger(field)
         case LongType => tryParseLong(field)
-        case _: DecimalType =>
-          // DecimalTypes have different precisions and scales, so we try to find the common type.
-          compatibleType(typeSoFar, tryParseDecimal(field)).getOrElse(StringType)
+        case _: DecimalType => tryParseDecimal(field)
         case DoubleType => tryParseDouble(field)
         case TimestampType => tryParseTimestamp(field)
         case BooleanType => tryParseBoolean(field)
@@ -116,6 +114,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
         case other: DataType =>
           throw new UnsupportedOperationException(s"Unexpected data type $other")
       }
+      compatibleType(typeSoFar, typeElemInfer).getOrElse(StringType)
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
index b014eb9..d268f8c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
@@ -56,11 +56,11 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
     assert(inferSchema.inferField(IntegerType, "1.0") == DoubleType)
     assert(inferSchema.inferField(DoubleType, null) == DoubleType)
     assert(inferSchema.inferField(DoubleType, "test") == StringType)
-    assert(inferSchema.inferField(LongType, "2015-08-20 14:57:00") == TimestampType)
-    assert(inferSchema.inferField(DoubleType, "2015-08-20 15:57:00") == TimestampType)
-    assert(inferSchema.inferField(LongType, "True") == BooleanType)
-    assert(inferSchema.inferField(IntegerType, "FALSE") == BooleanType)
-    assert(inferSchema.inferField(TimestampType, "FALSE") == BooleanType)
+    assert(inferSchema.inferField(LongType, "2015-08-20 14:57:00") == StringType)
+    assert(inferSchema.inferField(DoubleType, "2015-08-20 15:57:00") == StringType)
+    assert(inferSchema.inferField(LongType, "True") == StringType)
+    assert(inferSchema.inferField(IntegerType, "FALSE") == StringType)
+    assert(inferSchema.inferField(TimestampType, "FALSE") == StringType)
 
     val textValueOne = Long.MaxValue.toString + "0"
     val decimalValueOne = new java.math.BigDecimal(textValueOne)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 366cf11..fcb7bdc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -2341,6 +2341,18 @@ abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvDa
       checkAnswer(csv, Row(null))
     }
   }
+
+  test("SPARK-32025: infer the schema from mixed-type values") {
+    withTempPath { path =>
+      Seq("col_mixed_types", "2012", "1997", "True").toDS.write.text(path.getCanonicalPath)
+      val df = spark.read.format("csv")
+        .option("header", "true")
+        .option("inferSchema", "true")
+        .load(path.getCanonicalPath)
+
+      assert(df.schema.last == StructField("col_mixed_types", StringType, true))
+    }
+  }
 }
 
 class CSVv1Suite extends CSVSuite {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org