You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/09/08 14:21:16 UTC
[spark] branch branch-3.0 updated: [SPARK-32815][ML][3.0] Fix
LibSVM data source loading error on file paths with glob metacharacters
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 8c0b9cb [SPARK-32815][ML][3.0] Fix LibSVM data source loading error on file paths with glob metacharacters
8c0b9cb is described below
commit 8c0b9cbf68693db22314637a75f28e5aa954aff8
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Tue Sep 8 14:16:13 2020 +0000
[SPARK-32815][ML][3.0] Fix LibSVM data source loading error on file paths with glob metacharacters
### What changes were proposed in this pull request?
In the PR, I propose to fix an issue with LibSVM datasource when both of the following are true:
* no user specified schema
* some file paths contain escaped glob metacharacters, such as `[`, `]`, `{`, `}`, `*`, etc.
The fix is a backport of https://github.com/apache/spark/pull/29670, and it is based on another bug fix for CSV/JSON datasources https://github.com/apache/spark/pull/29659.
### Why are the changes needed?
To fix the issue when the following query tries to read from the path `[abc]`:
```scala
spark.read.format("libsvm").load("""/tmp/\[abc\].csv""").show
```
but would end up hitting an exception:
```
Path does not exist: file:/private/var/folders/p3/dfs6mf655d7fnjrsjvldh0tc0000gn/T/spark-6ef0ae5e-ff9f-4c4f-9ff4-0db3ee1f6a82/[abc]/part-00000-26406ab9-4e56-45fd-a25a-491c18a05e76-c000.libsvm;
org.apache.spark.sql.AnalysisException: Path does not exist: file:/private/var/folders/p3/dfs6mf655d7fnjrsjvldh0tc0000gn/T/spark-6ef0ae5e-ff9f-4c4f-9ff4-0db3ee1f6a82/[abc]/part-00000-26406ab9-4e56-45fd-a25a-491c18a05e76-c000.libsvm;
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$3(DataSource.scala:770)
at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:373)
at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
at scala.util.Success.$anonfun$map$1(Try.scala:255)
at scala.util.Success.map(Try.scala:213)
```
### Does this PR introduce _any_ user-facing change?
Yes
### How was this patch tested?
Added UT to `LibSVMRelationSuite`.
Closes #29675 from MaxGekk/globbing-paths-when-inferring-schema-ml-3.0.
Authored-by: Max Gekk <ma...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../spark/ml/source/libsvm/LibSVMRelation.scala | 2 +-
.../scala/org/apache/spark/mllib/util/MLUtils.scala | 3 ++-
.../spark/ml/source/libsvm/LibSVMRelationSuite.scala | 20 ++++++++++++++++++++
3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index da8f3a24f..11be1d8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -100,7 +100,7 @@ private[libsvm] class LibSVMFileFormat
"though the input. If you know the number in advance, please specify it via " +
"'numFeatures' option to avoid the extra scan.")
- val paths = files.map(_.getPath.toUri.toString)
+ val paths = files.map(_.getPath.toString)
val parsed = MLUtils.parseLibSVMFile(sparkSession, paths)
MLUtils.computeNumFeatures(parsed)
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 9198334..2411300 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -110,7 +110,8 @@ object MLUtils extends Logging {
DataSource.apply(
sparkSession,
paths = paths,
- className = classOf[TextFileFormat].getName
+ className = classOf[TextFileFormat].getName,
+ options = Map(DataSource.GLOB_PATHS_KEY -> "false")
).resolveRelation(checkFilesExist = false))
.select("value")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
index 263ad26..0999892 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
@@ -191,4 +191,24 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
spark.sql("DROP TABLE IF EXISTS libsvmTable")
}
}
+
+ test("SPARK-32815: Test LibSVM data source on file paths with glob metacharacters") {
+ withTempDir { dir =>
+ val basePath = dir.getCanonicalPath
+ // test libsvm writer / reader without specifying schema
+ val svmFileName = "[abc]"
+ val escapedSvmFileName = "\\[abc\\]"
+ val rawData = new java.util.ArrayList[Row]()
+ rawData.add(Row(1.0, Vectors.sparse(2, Seq((0, 2.0), (1, 3.0)))))
+ val struct = new StructType()
+ .add("labelFoo", DoubleType, false)
+ .add("featuresBar", VectorType, false)
+ val df = spark.createDataFrame(rawData, struct)
+ df.write.format("libsvm").save(s"$basePath/$svmFileName")
+ val df2 = spark.read.format("libsvm").load(s"$basePath/$escapedSvmFileName")
+ val row1 = df2.first()
+ val v = row1.getAs[SparseVector](1)
+ assert(v == Vectors.sparse(2, Seq((0, 2.0), (1, 3.0))))
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org