You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/07/20 21:03:15 UTC
[spark] branch branch-3.0 updated: [SPARK-32368][SQL] pathGlobFilter, recursiveFileLookup and basePath should respect case insensitivity

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 9e7b130  [SPARK-32368][SQL] pathGlobFilter, recursiveFileLookup and basePath should respect case insensitivity
9e7b130 is described below

commit 9e7b130ece1200504fde34d8ef40cb6a19509a01
Author: HyukjinKwon <gu...@apache.org>
AuthorDate: Mon Jul 20 13:56:00 2020 -0700

    [SPARK-32368][SQL] pathGlobFilter, recursiveFileLookup and basePath should respect case insensitivity
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to make the datasource options at `PartitioningAwareFileIndex` respect case insensitivity consistently:
    - `pathGlobFilter`
    - `recursiveFileLookup`
    - `basePath`
    
    ### Why are the changes needed?
    
    To support consistent case insensitivity in datasource options.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, users can now also specify these options case-insensitively, for example `PathglobFilter`.
    
    ### How was this patch tested?
    
    Unit tests were added. This PR reuses existing tests and adds extra clues to make it easier to track down the cause when a test breaks.
    
    Closes #29165 from HyukjinKwon/SPARK-32368.
    
    Authored-by: HyukjinKwon <gu...@apache.org>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
    (cherry picked from commit 133c5edc807ca87825f61dd9a5d36018620033ee)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../datasources/PartitioningAwareFileIndex.scala         | 11 +++++++----
 .../org/apache/spark/sql/FileBasedDataSourceSuite.scala  | 16 +++++++++-------
 .../spark/sql/execution/datasources/FileIndexSuite.scala | 16 +++++++++-------
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala
index 2e09c72..5341e22 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala
@@ -56,14 +56,17 @@ abstract class PartitioningAwareFileIndex(
 
   protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]]
 
-  protected lazy val pathGlobFilter = parameters.get("pathGlobFilter").map(new GlobFilter(_))
+  private val caseInsensitiveMap = CaseInsensitiveMap(parameters)
+
+  protected lazy val pathGlobFilter: Option[GlobFilter] =
+    caseInsensitiveMap.get("pathGlobFilter").map(new GlobFilter(_))
 
   protected def matchGlobPattern(file: FileStatus): Boolean = {
     pathGlobFilter.forall(_.accept(file.getPath))
   }
 
-  protected lazy val recursiveFileLookup = {
-    parameters.getOrElse("recursiveFileLookup", "false").toBoolean
+  protected lazy val recursiveFileLookup: Boolean = {
+    caseInsensitiveMap.getOrElse("recursiveFileLookup", "false").toBoolean
   }
 
   override def listFiles(
@@ -215,7 +218,7 @@ abstract class PartitioningAwareFileIndex(
    * and the returned DataFrame will have the column of `something`.
    */
   private def basePaths: Set[Path] = {
-    parameters.get(BASE_PATH_PARAM).map(new Path(_)) match {
+    caseInsensitiveMap.get(BASE_PATH_PARAM).map(new Path(_)) match {
       case Some(userDefinedBasePath) =>
         val fs = userDefinedBasePath.getFileSystem(hadoopConf)
         if (!fs.isDirectory(userDefinedBasePath)) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 231a8f2..e9bff64 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -633,13 +633,15 @@ class FileBasedDataSourceSuite extends QueryTest
 
       assert(fileList.toSet === expectedFileList.toSet)
 
-      val fileList2 = spark.read.format("binaryFile")
-        .option("recursiveFileLookup", true)
-        .option("pathGlobFilter", "*.bin")
-        .load(dataPath)
-        .select("path").collect().map(_.getString(0))
-
-      assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet)
+      withClue("SPARK-32368: 'recursiveFileLookup' and 'pathGlobFilter' can be case insensitive") {
+        val fileList2 = spark.read.format("binaryFile")
+          .option("RecuRsivefileLookup", true)
+          .option("PaThglobFilter", "*.bin")
+          .load(dataPath)
+          .select("path").collect().map(_.getString(0))
+
+        assert(fileList2.toSet === expectedFileList.filter(_.endsWith(".bin")).toSet)
+      }
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
index 553773e..c412d95 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
@@ -362,13 +362,15 @@ class FileIndexSuite extends SharedSparkSession {
       val wrongBasePath = new File(dir, "unknown")
       // basePath must be a directory
       wrongBasePath.mkdir()
-      val parameters = Map("basePath" -> wrongBasePath.getCanonicalPath)
-      val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None)
-      val msg = intercept[IllegalArgumentException] {
-        // trigger inferPartitioning()
-        fileIndex.partitionSpec()
-      }.getMessage
-      assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path")
+      withClue("SPARK-32368: 'basePath' can be case insensitive") {
+        val parameters = Map("bAsepAtH" -> wrongBasePath.getCanonicalPath)
+        val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None)
+        val msg = intercept[IllegalArgumentException] {
+          // trigger inferPartitioning()
+          fileIndex.partitionSpec()
+        }.getMessage
+        assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path")
+      }
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org