You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2018/12/31 16:15:31 UTC
[spark] branch master updated: [SPARK-26339][SQL] Throws better
exception when reading files that start with underscore
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c0b9db1 [SPARK-26339][SQL] Throws better exception when reading files that start with underscore
c0b9db1 is described below
commit c0b9db120d4c2ad0b5b99b9152549e94ef8f5a2d
Author: Hirobe Keiichi <ke...@forcia.com>
AuthorDate: Mon Dec 31 10:15:14 2018 -0600
[SPARK-26339][SQL] Throws better exception when reading files that start with underscore
## What changes were proposed in this pull request?
As described in SPARK-26339, the behavior of spark.read is very confusing when reading files whose names start with an underscore. Fix this by throwing an exception whose message is "Path does not exist".
## How was this patch tested?
manual tests.
Both of the snippets below throw an exception whose message is "Path does not exist".
```
spark.read.csv("/home/forcia/work/spark/_test.csv")
spark.read.schema("test STRING, number INT").csv("/home/forcia/work/spark/_test.csv")
```
Closes #23288 from KeiichiHirobe/SPARK-26339.
Authored-by: Hirobe Keiichi <ke...@forcia.com>
Signed-off-by: Sean Owen <se...@databricks.com>
---
.../spark/sql/execution/datasources/DataSource.scala | 17 ++++++++++++++++-
sql/core/src/test/resources/test-data/_cars.csv | 7 +++++++
.../sql/execution/datasources/csv/CSVSuite.scala | 20 ++++++++++++++++++++
3 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index fefff68..517e043 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -543,7 +543,7 @@ case class DataSource(
checkFilesExist: Boolean): Seq[Path] = {
val allPaths = caseInsensitiveOptions.get("path") ++ paths
val hadoopConf = sparkSession.sessionState.newHadoopConf()
- allPaths.flatMap { path =>
+ val allGlobPath = allPaths.flatMap { path =>
val hdfsPath = new Path(path)
val fs = hdfsPath.getFileSystem(hadoopConf)
val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
@@ -560,6 +560,21 @@ case class DataSource(
}
globPath
}.toSeq
+
+ val (filteredOut, filteredIn) = allGlobPath.partition { path =>
+ InMemoryFileIndex.shouldFilterOut(path.getName)
+ }
+ if (filteredOut.nonEmpty) {
+ if (filteredIn.isEmpty) {
+ throw new AnalysisException(
+ s"All paths were ignored:\n${filteredOut.mkString("\n ")}")
+ } else {
+ logDebug(
+ s"Some paths were ignored:\n${filteredOut.mkString("\n ")}")
+ }
+ }
+
+ allGlobPath
}
}
diff --git a/sql/core/src/test/resources/test-data/_cars.csv b/sql/core/src/test/resources/test-data/_cars.csv
new file mode 100644
index 0000000..40ded57
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/_cars.csv
@@ -0,0 +1,7 @@
+
+year,make,model,comment,blank
+"2012","Tesla","S","No comment",
+
+1997,Ford,E350,"Go get one now they are going fast",
+2015,Chevy,Volt
+
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index d9e5d7a..fb1bedf 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -53,6 +53,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
private val carsEmptyValueFile = "test-data/cars-empty-value.csv"
private val carsBlankColName = "test-data/cars-blank-column-name.csv"
private val carsCrlf = "test-data/cars-crlf.csv"
+ private val carsFilteredOutFile = "test-data/_cars.csv"
private val emptyFile = "test-data/empty.csv"
private val commentsFile = "test-data/comments.csv"
private val disableCommentsFile = "test-data/disable_comments.csv"
@@ -346,6 +347,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
assert(result.schema.fieldNames.size === 1)
}
+ test("SPARK-26339 Not throw an exception if some of specified paths are filtered in") {
+ val cars = spark
+ .read
+ .option("header", "false")
+ .csv(testFile(carsFile), testFile(carsFilteredOutFile))
+
+ verifyCars(cars, withHeader = false, checkTypes = false)
+ }
+
+ test("SPARK-26339 Throw an exception only if all of the specified paths are filtered out") {
+ val e = intercept[AnalysisException] {
+ val cars = spark
+ .read
+ .option("header", "false")
+ .csv(testFile(carsFilteredOutFile))
+ }.getMessage
+ assert(e.contains("All paths were ignored:"))
+ }
+
test("DDL test with empty file") {
withView("carsTable") {
spark.sql(
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org