You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/07/20 21:03:15 UTC
[spark] branch branch-3.0 updated: [SPARK-32368][SQL]
pathGlobFilter,
recursiveFileLookup and basePath should respect case insensitivity
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 9e7b130 [SPARK-32368][SQL] pathGlobFilter, recursiveFileLookup and basePath should respect case insensitivity
9e7b130 is described below
commit 9e7b130ece1200504fde34d8ef40cb6a19509a01
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Mon Jul 20 13:56:00 2020 -0700
[SPARK-32368][SQL] pathGlobFilter, recursiveFileLookup and basePath should respect case insensitivity
### What changes were proposed in this pull request?
This PR proposes to make the datasource options at `PartitioningAwareFileIndex` respect case insensitivity consistently:
- `pathGlobFilter`
- `recursiveFileLookup`
- `basePath`
### Why are the changes needed?
To support consistent case insensitivity in datasource options.
### Does this PR introduce _any_ user-facing change?
Yes, now users can also use case insensitive options such as `PathglobFilter`.
### How was this patch tested?
Unit tests were added. They reuse existing tests and add extra clues to make it easier to track when a test is broken.
Closes #29165 from HyukjinKwon/SPARK-32368.
Authored-by: HyukjinKwon <gu...@apache.org>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
(cherry picked from commit 133c5edc807ca87825f61dd9a5d36018620033ee)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
.../datasources/PartitioningAwareFileIndex.scala | 11 +++++++----
.../org/apache/spark/sql/FileBasedDataSourceSuite.scala | 16 +++++++++-------
.../spark/sql/execution/datasources/FileIndexSuite.scala | 16 +++++++++-------
3 files changed, 25 insertions(+), 18 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala
index 2e09c72..5341e22 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala
@@ -56,14 +56,17 @@ abstract class PartitioningAwareFileIndex(
protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]]
- protected lazy val pathGlobFilter = parameters.get("pathGlobFilter").map(new GlobFilter(_))
+ private val caseInsensitiveMap = CaseInsensitiveMap(parameters)
+
+ protected lazy val pathGlobFilter: Option[GlobFilter] =
+ caseInsensitiveMap.get("pathGlobFilter").map(new GlobFilter(_))
protected def matchGlobPattern(file: FileStatus): Boolean = {
pathGlobFilter.forall(_.accept(file.getPath))
}
- protected lazy val recursiveFileLookup = {
- parameters.getOrElse("recursiveFileLookup", "false").toBoolean
+ protected lazy val recursiveFileLookup: Boolean = {
+ caseInsensitiveMap.getOrElse("recursiveFileLookup", "false").toBoolean
}
override def listFiles(
@@ -215,7 +218,7 @@ abstract class PartitioningAwareFileIndex(
* and the returned DataFrame will have the column of `something`.
*/
private def basePaths: Set[Path] = {
- parameters.get(BASE_PATH_PARAM).map(new Path(_)) match {
+ caseInsensitiveMap.get(BASE_PATH_PARAM).map(new Path(_)) match {
case Some(userDefinedBasePath) =>
val fs = userDefinedBasePath.getFileSystem(hadoopConf)
if (!fs.isDirectory(userDefinedBasePath)) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 231a8f2..e9bff64 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -633,13 +633,15 @@ class FileBasedDataSourceSuite extends QueryTest
assert(fileList.toSet === expectedFileList.toSet)
- val fileList2 = spark.read.format("binaryFile")
- .option("recursiveFileLookup", true)
- .option("pathGlobFilter", "*.bin")
- .load(dataPath)
- .select("path").collect().map(_.getString(0))
-
- assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet)
+ withClue("SPARK-32368: 'recursiveFileLookup' and 'pathGlobFilter' can be case insensitive") {
+ val fileList2 = spark.read.format("binaryFile")
+ .option("RecuRsivefileLookup", true)
+ .option("PaThglobFilter", "*.bin")
+ .load(dataPath)
+ .select("path").collect().map(_.getString(0))
+
+ assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet)
+ }
}
}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
index 553773e..c412d95 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
@@ -362,13 +362,15 @@ class FileIndexSuite extends SharedSparkSession {
val wrongBasePath = new File(dir, "unknown")
// basePath must be a directory
wrongBasePath.mkdir()
- val parameters = Map("basePath" -> wrongBasePath.getCanonicalPath)
- val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None)
- val msg = intercept[IllegalArgumentException] {
- // trigger inferPartitioning()
- fileIndex.partitionSpec()
- }.getMessage
- assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path")
+ withClue("SPARK-32368: 'basePath' can be case insensitive") {
+ val parameters = Map("bAsepAtH" -> wrongBasePath.getCanonicalPath)
+ val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None)
+ val msg = intercept[IllegalArgumentException] {
+ // trigger inferPartitioning()
+ fileIndex.partitionSpec()
+ }.getMessage
+ assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path")
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org