You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2020/04/26 16:36:42 UTC
[spark] branch master updated: [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new fe07b21  [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib
fe07b21 is described below

commit fe07b21b8ab60def6c4451c661e4dd46a4d48b5a
Author: TJX2014 <xi...@gmail.com>
AuthorDate: Sun Apr 26 11:35:44 2020 -0500

    [SPARK-31400][ML] The catalogString doesn't distinguish Vectors in ml and mllib
    
    What changes were proposed in this pull request?
    1. Add class info output in org.apache.spark.ml.util.SchemaUtils#checkColumnType to distinguish Vectors in ml and mllib
    2. Add a unit test
    
    Why are the changes needed?
    The catalogString doesn't distinguish Vectors in ml and mllib when an mllib vector is misused in ml
    https://issues.apache.org/jira/browse/SPARK-31400
    
    Does this PR introduce any user-facing change?
    No
    
    How was this patch tested?
    A unit test is added
    
    Closes #28347 from TJX2014/master-catalogString-distinguish-Vectors-in-ml-and-mllib.
    
    Authored-by: TJX2014 <xi...@gmail.com>
    Signed-off-by: Sean Owen <sr...@gmail.com>
---
 .../org/apache/spark/ml/util/SchemaUtils.scala      |  4 ++--
 .../apache/spark/mllib/util/TestingUtilsSuite.scala | 21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 752069d..c08d7e8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -42,8 +42,8 @@ private[spark] object SchemaUtils {
     val actualDataType = schema(colName).dataType
     val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
     require(actualDataType.equals(dataType),
-      s"Column $colName must be of type ${dataType.catalogString} but was actually " +
-        s"${actualDataType.catalogString}.$message")
+      s"Column $colName must be of type ${dataType.getClass}:${dataType.catalogString} " +
+        s"but was actually ${actualDataType.getClass}:${actualDataType.catalogString}.$message")
   }
 
   /**
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
index 3fcf1cf..bc80e86 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtilsSuite.scala
@@ -20,9 +20,11 @@ package org.apache.spark.mllib.util
 import org.scalatest.exceptions.TestFailedException
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.linalg.VectorUDT
+import org.apache.spark.ml.util.SchemaUtils
 import org.apache.spark.mllib.linalg.{Matrices, Vectors}
 import org.apache.spark.mllib.util.TestingUtils._
-
+import org.apache.spark.sql.types.{StructField, StructType}
 class TestingUtilsSuite extends SparkFunSuite {
 
   test("Comparing doubles using relative error.") {
@@ -457,4 +459,21 @@ class TestingUtilsSuite extends SparkFunSuite {
     assert(Matrices.sparse(2, 2, Array(0, 1, 2), Array(0, 1), Array(3.1, 3.5)) !~=
       Matrices.dense(0, 0, Array()) relTol 0.01)
   }
+
+  test("SPARK-31400, catalogString distinguish Vectors in ml and mllib") {
+    val schema = StructType(Array[StructField] {
+      StructField("features", new org.apache.spark.mllib.linalg.VectorUDT)
+    })
+    val e = intercept[IllegalArgumentException] {
+      SchemaUtils.checkColumnType(schema, "features", new VectorUDT)
+    }
+    assert(e.getMessage.contains(
+      "org.apache.spark.mllib.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>"),
+      "dataType is not desired")
+
+    val normalSchema = StructType(Array[StructField] {
+      StructField("features", new VectorUDT)
+    })
+    SchemaUtils.checkColumnType(normalSchema, "features", new VectorUDT)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org