You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/09/08 14:22:48 UTC

[spark] branch branch-2.4 updated: [SPARK-32815][ML][2.4] Fix LibSVM data source loading error on file paths with glob metacharacters

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new ef24542  [SPARK-32815][ML][2.4] Fix LibSVM data source loading error on file paths with glob metacharacters
ef24542 is described below

commit ef24542721bd1107339c6efef8f14a970e429384
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Tue Sep 8 14:17:18 2020 +0000

    [SPARK-32815][ML][2.4] Fix LibSVM data source loading error on file paths with glob metacharacters
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to fix an issue with LibSVM datasource when both of the following are true:
    * no user specified schema
    * some file paths contain escaped glob metacharacters, such as `[``]`, `{``}`, `*` etc.
    
    The fix is a backport of https://github.com/apache/spark/pull/29675, and it is based on another bug fix for CSV/JSON datasources https://github.com/apache/spark/pull/29663.
    
    ### Why are the changes needed?
    To fix the issue when the following query tries to read from a path like `[abc]`:
    ```scala
    spark.read.format("libsvm").load("""/tmp/\[abc\].csv""").show
    ```
    but would end up hitting an exception:
    ```
    Path does not exist: file:/private/var/folders/p3/dfs6mf655d7fnjrsjvldh0tc0000gn/T/spark-6ef0ae5e-ff9f-4c4f-9ff4-0db3ee1f6a82/[abc]/part-00000-26406ab9-4e56-45fd-a25a-491c18a05e76-c000.libsvm;
    org.apache.spark.sql.AnalysisException: Path does not exist: file:/private/var/folders/p3/dfs6mf655d7fnjrsjvldh0tc0000gn/T/spark-6ef0ae5e-ff9f-4c4f-9ff4-0db3ee1f6a82/[abc]/part-00000-26406ab9-4e56-45fd-a25a-491c18a05e76-c000.libsvm;
    	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$3(DataSource.scala:770)
    	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:373)
    	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
    	at scala.util.Success.$anonfun$map$1(Try.scala:255)
    	at scala.util.Success.map(Try.scala:213)
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes
    
    ### How was this patch tested?
    Added UT to `LibSVMRelationSuite`.
    
    Closes #29678 from MaxGekk/globbing-paths-when-inferring-schema-ml-2.4.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../apache/spark/ml/source/libsvm/LibSVMRelation.scala |  2 +-
 .../scala/org/apache/spark/mllib/util/MLUtils.scala    |  3 ++-
 .../spark/ml/source/libsvm/LibSVMRelationSuite.scala   | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 39dcd91..5795812 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -99,7 +99,7 @@ private[libsvm] class LibSVMFileFormat
         "though the input. If you know the number in advance, please specify it via " +
         "'numFeatures' option to avoid the extra scan.")
 
-      val paths = files.map(_.getPath.toUri.toString)
+      val paths = files.map(_.getPath.toString)
       val parsed = MLUtils.parseLibSVMFile(sparkSession, paths)
       MLUtils.computeNumFeatures(parsed)
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 14af8b5..c8550cd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -110,7 +110,8 @@ object MLUtils extends Logging {
       DataSource.apply(
         sparkSession,
         paths = paths,
-        className = classOf[TextFileFormat].getName
+        className = classOf[TextFileFormat].getName,
+        options = Map(DataSource.GLOB_PATHS_KEY -> "false")
       ).resolveRelation(checkFilesExist = false))
       .select("value")
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
index 3eabff4..28c770c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
@@ -184,4 +184,22 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
       spark.sql("DROP TABLE IF EXISTS libsvmTable")
     }
   }
+
+  test("SPARK-32815: Test LibSVM data source on file paths with glob metacharacters") {
+    val basePath = Utils.createDirectory(tempDir.getCanonicalPath, "globbing")
+    // test libsvm writer / reader without specifying schema
+    val svmFileName = "[abc]"
+    val escapedSvmFileName = "\\[abc\\]"
+    val rawData = new java.util.ArrayList[Row]()
+    rawData.add(Row(1.0, Vectors.sparse(2, Seq((0, 2.0), (1, 3.0)))))
+    val struct = new StructType()
+      .add("labelFoo", DoubleType, false)
+      .add("featuresBar", VectorType, false)
+    val df = spark.createDataFrame(rawData, struct)
+    df.write.format("libsvm").save(s"$basePath/$svmFileName")
+    val df2 = spark.read.format("libsvm").load(s"$basePath/$escapedSvmFileName")
+    val row1 = df2.first()
+    val v = row1.getAs[SparseVector](1)
+    assert(v == Vectors.sparse(2, Seq((0, 2.0), (1, 3.0))))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org