You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2017/07/11 10:11:12 UTC
spark git commit: [SPARK-21263][SQL] Do not allow partially parsing
double and floats via NumberFormat in CSV
Repository: spark
Updated Branches:
refs/heads/master a4baa8f48 -> 7514db1de
[SPARK-21263][SQL] Do not allow partially parsing double and floats via NumberFormat in CSV
## What changes were proposed in this pull request?
This PR proposes removing the use of `NumberFormat.parse` so that partially parsed data is no longer accepted. For example, the malformed value `10u12` is currently parsed as `10.0`:
```
scala> spark.read.schema("a DOUBLE").option("mode", "FAILFAST").csv(Seq("10u12").toDS).show()
+----+
| a|
+----+
|10.0|
+----+
```
## How was this patch tested?
Unit tests added in `UnivocityParserSuite` and `CSVSuite`.
Author: hyukjinkwon <gu...@gmail.com>
Closes #18532 from HyukjinKwon/SPARK-21263.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7514db1d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7514db1d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7514db1d
Branch: refs/heads/master
Commit: 7514db1deca22b44ce18e4c571275ce79addc100
Parents: a4baa8f
Author: hyukjinkwon <gu...@gmail.com>
Authored: Tue Jul 11 11:11:08 2017 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Tue Jul 11 11:11:08 2017 +0100
----------------------------------------------------------------------
.../datasources/csv/UnivocityParser.scala | 8 ++------
.../execution/datasources/csv/CSVSuite.scala | 21 ++++++++++++++++++++
.../datasources/csv/UnivocityParserSuite.scala | 21 ++++++++++----------
3 files changed, 34 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/7514db1d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index c3657ac..0e41f3c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -111,9 +111,7 @@ class UnivocityParser(
case options.nanValue => Float.NaN
case options.negativeInf => Float.NegativeInfinity
case options.positiveInf => Float.PositiveInfinity
- case datum =>
- Try(datum.toFloat)
- .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).floatValue())
+ case datum => datum.toFloat
}
case _: DoubleType => (d: String) =>
@@ -121,9 +119,7 @@ class UnivocityParser(
case options.nanValue => Double.NaN
case options.negativeInf => Double.NegativeInfinity
case options.positiveInf => Double.PositiveInfinity
- case datum =>
- Try(datum.toDouble)
- .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).doubleValue())
+ case datum => datum.toDouble
}
case _: BooleanType => (d: String) =>
http://git-wip-us.apache.org/repos/asf/spark/blob/7514db1d/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 89d9b69..487c84f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1174,4 +1174,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}
}
}
+
+ test("SPARK-21263: Invalid float and double are handled correctly in different modes") {
+ val exception = intercept[SparkException] {
+ spark.read.schema("a DOUBLE")
+ .option("mode", "FAILFAST")
+ .csv(Seq("10u12").toDS())
+ .collect()
+ }
+ assert(exception.getMessage.contains("""input string: "10u12""""))
+
+ val count = spark.read.schema("a FLOAT")
+ .option("mode", "DROPMALFORMED")
+ .csv(Seq("10u12").toDS())
+ .count()
+ assert(count == 0)
+
+ val results = spark.read.schema("a FLOAT")
+ .option("mode", "PERMISSIVE")
+ .csv(Seq("10u12").toDS())
+ checkAnswer(results, Row(null))
+ }
}
http://git-wip-us.apache.org/repos/asf/spark/blob/7514db1d/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala
index a74b22a..efbf735 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParserSuite.scala
@@ -130,16 +130,17 @@ class UnivocityParserSuite extends SparkFunSuite {
DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime))
}
- test("Float and Double Types are cast without respect to platform default Locale") {
- val originalLocale = Locale.getDefault
- try {
- Locale.setDefault(new Locale("fr", "FR"))
- // Would parse as 1.0 in fr-FR
- val options = new CSVOptions(Map.empty[String, String], "GMT")
- assert(parser.makeConverter("_1", FloatType, options = options).apply("1,00") == 100.0)
- assert(parser.makeConverter("_1", DoubleType, options = options).apply("1,00") == 100.0)
- } finally {
- Locale.setDefault(originalLocale)
+ test("Throws exception for casting an invalid string to Float and Double Types") {
+ val options = new CSVOptions(Map.empty[String, String], "GMT")
+ val types = Seq(DoubleType, FloatType)
+ val input = Seq("10u000", "abc", "1 2/3")
+ types.foreach { dt =>
+ input.foreach { v =>
+ val message = intercept[NumberFormatException] {
+ parser.makeConverter("_1", dt, options = options).apply(v)
+ }.getMessage
+ assert(message.contains(v))
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org