You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2020/04/26 16:36:42 UTC
[spark] branch master updated: [SPARK-31400][ML] The catalogString
doesn't distinguish Vectors in ml and mllib
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new fe07b21 [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib
fe07b21 is described below
commit fe07b21b8ab60def6c4451c661e4dd46a4d48b5a
Author: TJX2014 <xi...@gmail.com>
AuthorDate: Sun Apr 26 11:35:44 2020 -0500
[SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib
What changes were proposed in this pull request?
1. Add class info to the error message produced by org.apache.spark.ml.util.SchemaUtils#checkColumnType to distinguish Vectors in ml and mllib
2. Add a unit test
Why are the changes needed?
The catalogString doesn't distinguish between Vectors in ml and mllib when an mllib vector is misused in ml.
https://issues.apache.org/jira/browse/SPARK-31400
Does this PR introduce any user-facing change?
No
How was this patch tested?
A unit test was added.
Closes #28347 from TJX2014/master-catalogString-distinguish-Vectors-in-ml-and-mllib.
Authored-by: TJX2014 <xi...@gmail.com>
Signed-off-by: Sean Owen <sr...@gmail.com>
---
.../org/apache/spark/ml/util/SchemaUtils.scala | 4 ++--
.../apache/spark/mllib/util/TestingUtilsSuite.scala | 21 ++++++++++++++++++++-
2 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 752069d..c08d7e8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -42,8 +42,8 @@ private[spark] object SchemaUtils {
val actualDataType = schema(colName).dataType
val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
require(actualDataType.equals(dataType),
- s"Column $colName must be of type ${dataType.catalogString} but was actually " +
- s"${actualDataType.catalogString}.$message")
+ s"Column $colName must be of type ${dataType.getClass}:${dataType.catalogString} " +
+ s"but was actually ${actualDataType.getClass}:${actualDataType.catalogString}.$message")
}
/**
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
index 3fcf1cf..bc80e86 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
@@ -20,9 +20,11 @@ package org.apache.spark.mllib.util
import org.scalatest.exceptions.TestFailedException
import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.linalg.VectorUDT
+import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.util.TestingUtils._
-
+import org.apache.spark.sql.types.{StructField, StructType}
class TestingUtilsSuite extends SparkFunSuite {
test("Comparing doubles using relative error.") {
@@ -457,4 +459,21 @@ class TestingUtilsSuite extends SparkFunSuite {
assert(Matrices.sparse(2, 2, Array(0, 1, 2), Array(0, 1), Array(3.1, 3.5)) !~=
Matrices.dense(0, 0, Array()) relTol 0.01)
}
+
+ test("SPARK-31400, catalogString distinguish Vectors in ml and mllib") {
+ val schema = StructType(Array[StructField] {
+ StructField("features", new org.apache.spark.mllib.linalg.VectorUDT)
+ })
+ val e = intercept[IllegalArgumentException] {
+ SchemaUtils.checkColumnType(schema, "features", new VectorUDT)
+ }
+ assert(e.getMessage.contains(
+ "org.apache.spark.mllib.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>"),
+ "dataType is not desired")
+
+ val normalSchema = StructType(Array[StructField] {
+ StructField("features", new VectorUDT)
+ })
+ SchemaUtils.checkColumnType(normalSchema, "features", new VectorUDT)
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org