You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2015/09/08 23:44:10 UTC
spark git commit: [SPARK-10468] [ MLLIB ] Verify schema before Dataframe select API call

Repository: spark
Updated Branches:
  refs/heads/master 7a9dcbc91 -> e6f8d3686


[SPARK-10468] [ MLLIB ] Verify schema before Dataframe select API call

Loader.checkSchema was called to verify the schema after dataframe.select(...).
Schema verification should be done before dataframe.select(...)

Author: Vinod K C <vi...@huawei.com>

Closes #8636 from vinodkc/fix_GaussianMixtureModel_load_verification.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6f8d368
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6f8d368
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6f8d368

Branch: refs/heads/master
Commit: e6f8d3686016a305a747c5bcc85f46fd4c0cbe83
Parents: 7a9dcbc
Author: Vinod K C <vi...@huawei.com>
Authored: Tue Sep 8 14:44:05 2015 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Tue Sep 8 14:44:05 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/mllib/clustering/GaussianMixtureModel.scala | 3 +--
 .../src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e6f8d368/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 7f6163e..a590219 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -168,10 +168,9 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
       val dataPath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
       val dataFrame = sqlContext.read.parquet(dataPath)
-      val dataArray = dataFrame.select("weight", "mu", "sigma").collect()
-
       // Check schema explicitly since erasure makes it hard to use match-case for checking.
       Loader.checkSchema[Data](dataFrame.schema)
+      val dataArray = dataFrame.select("weight", "mu", "sigma").collect()
 
       val (weights, gaussians) = dataArray.map {
         case Row(weight: Double, mu: Vector, sigma: Matrix) =>

http://git-wip-us.apache.org/repos/asf/spark/blob/e6f8d368/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 36b124c..58857c3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -590,12 +590,10 @@ object Word2VecModel extends Loader[Word2VecModel] {
       val dataPath = Loader.dataPath(path)
       val sqlContext = new SQLContext(sc)
       val dataFrame = sqlContext.read.parquet(dataPath)
-
-      val dataArray = dataFrame.select("word", "vector").collect()
-
       // Check schema explicitly since erasure makes it hard to use match-case for checking.
       Loader.checkSchema[Data](dataFrame.schema)
 
+      val dataArray = dataFrame.select("word", "vector").collect()
       val word2VecMap = dataArray.map(i => (i.getString(0), i.getSeq[Float](1).toArray)).toMap
       new Word2VecModel(word2VecMap)
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org