You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/02/21 22:22:01 UTC
spark git commit: [SPARK-13137][SQL] NullPointerException in schema
inference for CSV when the first line is empty
Repository: spark
Updated Branches:
refs/heads/master b6a873d6d -> 7eb83fefd
[SPARK-13137][SQL] NullPointerException in schema inference for CSV when the first line is empty
https://issues.apache.org/jira/browse/SPARK-13137
This PR adds a filter to schema inference that skips empty lines, so that it no longer throws a NullPointerException when the first line is empty.
Also, I removed `MAX_COMMENT_LINES_IN_HEADER` and instead used a chain of `filter()` and `first()` calls.
Lastly, rather than adding a new test file, I simply added an empty line at the top of the existing `cars.csv` so that this case is covered by the original tests.
Author: hyukjinkwon <gu...@gmail.com>
Closes #11023 from HyukjinKwon/SPARK-13137.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7eb83fef
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7eb83fef
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7eb83fef
Branch: refs/heads/master
Commit: 7eb83fefd19e137d80a23b5174b66b14831c291a
Parents: b6a873d
Author: hyukjinkwon <gu...@gmail.com>
Authored: Sun Feb 21 13:21:59 2016 -0800
Committer: Reynold Xin <rx...@databricks.com>
Committed: Sun Feb 21 13:21:59 2016 -0800
----------------------------------------------------------------------
.../sql/execution/datasources/csv/CSVOptions.scala | 3 ---
.../sql/execution/datasources/csv/CSVRelation.scala | 12 +++++++-----
sql/core/src/test/resources/cars.csv | 1 +
3 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/7eb83fef/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index bea8e97..38aa2dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -75,9 +75,6 @@ private[sql] class CSVOptions(
val ignoreLeadingWhiteSpaceFlag = getBool("ignoreLeadingWhiteSpace")
val ignoreTrailingWhiteSpaceFlag = getBool("ignoreTrailingWhiteSpace")
- // Limit the number of lines we'll search for a header row that isn't comment-prefixed
- val MAX_COMMENT_LINES_IN_HEADER = 10
-
// Parse mode flags
if (!ParseModes.isValidMode(parseMode)) {
logWarning(s"$parseMode is not a valid parse mode. Using ${ParseModes.DEFAULT}.")
http://git-wip-us.apache.org/repos/asf/spark/blob/7eb83fef/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
index f8e3a1b..471ed0d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala
@@ -154,12 +154,14 @@ private[csv] class CSVRelation(
*/
private def findFirstLine(rdd: RDD[String]): String = {
if (params.isCommentSet) {
- rdd.take(params.MAX_COMMENT_LINES_IN_HEADER)
- .find(!_.startsWith(params.comment.toString))
- .getOrElse(sys.error(s"No uncommented header line in " +
- s"first ${params.MAX_COMMENT_LINES_IN_HEADER} lines"))
+ val comment = params.comment.toString
+ rdd.filter { line =>
+ line.trim.nonEmpty && !line.startsWith(comment)
+ }.first()
} else {
- rdd.first()
+ rdd.filter { line =>
+ line.trim.nonEmpty
+ }.first()
}
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/7eb83fef/sql/core/src/test/resources/cars.csv
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/cars.csv b/sql/core/src/test/resources/cars.csv
index 2b9d74c..40ded57 100644
--- a/sql/core/src/test/resources/cars.csv
+++ b/sql/core/src/test/resources/cars.csv
@@ -1,3 +1,4 @@
+
year,make,model,comment,blank
"2012","Tesla","S","No comment",
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org