You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by tr...@apache.org on 2015/06/09 10:29:53 UTC

[2/2] flink git commit: [ml] Makes StandardScaler's state package private and reduces redundant code. Adjusts flink-ml readme.

[ml] Makes StandardScaler's state package private and reduces redundant code. Adjusts flink-ml readme.


Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/97611c24
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/97611c24
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/97611c24

Branch: refs/heads/master
Commit: 97611c245f4df5820124fba25e55a2bac59086b4
Parents: 73f9911
Author: Till Rohrmann <tr...@apache.org>
Authored: Tue Jun 9 10:23:09 2015 +0200
Committer: Till Rohrmann <tr...@apache.org>
Committed: Tue Jun 9 10:29:24 2015 +0200

----------------------------------------------------------------------
 flink-staging/flink-ml/README.md                |  5 +--
 .../flink/ml/preprocessing/StandardScaler.scala | 44 +++++++++++++-------
 2 files changed, 32 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/flink/blob/97611c24/flink-staging/flink-ml/README.md
----------------------------------------------------------------------
diff --git a/flink-staging/flink-ml/README.md b/flink-staging/flink-ml/README.md
index 5721e8f..5cabd7c 100644
--- a/flink-staging/flink-ml/README.md
+++ b/flink-staging/flink-ml/README.md
@@ -7,10 +7,9 @@ Theses implementations allow to scale to data sizes which vastly exceed the memo
 Flink-ML currently comprises the following algorithms:
 
 * Classification
+** Soft-margin SVM
 * Regression
-** Logistic regression
-* Clustering
-** k-Means
+** Multiple linear regression
 * Recommendation
 ** Alternating least squares (ALS)
 

http://git-wip-us.apache.org/repos/asf/flink/blob/97611c24/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
----------------------------------------------------------------------
diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
index 3b9c8d2..bf09b20 100644
--- a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
+++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
@@ -21,10 +21,8 @@ package org.apache.flink.ml.preprocessing
 import breeze.linalg
 import breeze.numerics.sqrt
 import breeze.numerics.sqrt._
-import org.apache.flink.api.common.functions._
 import org.apache.flink.api.common.typeinfo.TypeInformation
 import org.apache.flink.api.scala._
-import org.apache.flink.configuration.Configuration
 import org.apache.flink.ml._
 import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap}
 import org.apache.flink.ml.math.Breeze._
@@ -62,7 +60,9 @@ import scala.reflect.ClassTag
   */
 class StandardScaler extends Transformer[StandardScaler] {
 
-  var metricsOption: Option[DataSet[(linalg.Vector[Double], linalg.Vector[Double])]] = None
+  private[preprocessing] var metricsOption: Option[
+      DataSet[(linalg.Vector[Double], linalg.Vector[Double])]
+    ] = None
 
   /** Sets the target mean of the transformed data
     *
@@ -213,12 +213,7 @@ object StandardScaler {
             input.mapWithBcVariable(metrics){
               (vector, metrics) => {
                 val (broadcastMean, broadcastStd) = metrics
-                var myVector = vector.asBreeze
-
-                myVector -= broadcastMean
-                myVector :/= broadcastStd
-                myVector = (myVector :* std) + mean
-                myVector.fromBreeze
+                scaleVector(vector, broadcastMean, broadcastStd, mean, std)
               }
             }
           }
@@ -245,12 +240,8 @@ object StandardScaler {
               (labeledVector, metrics) => {
                 val (broadcastMean, broadcastStd) = metrics
                 val LabeledVector(label, vector) = labeledVector
-                var breezeVector = vector.asBreeze
 
-                breezeVector -= broadcastMean
-                breezeVector :/= broadcastStd
-                breezeVector = (breezeVector :* std) + mean
-                LabeledVector(label, breezeVector.fromBreeze)
+                LabeledVector(label, scaleVector(vector, broadcastMean, broadcastStd, mean, std))
               }
             }
           }
@@ -262,4 +253,29 @@ object StandardScaler {
       }
     }
   }
+
+  /** Scales the given vector such that it has the given mean and std
+    *
+    * @param vector Vector to be scaled
+    * @param dataMean Mean of the training data
+    * @param dataStd Standard deviation of the training data
+    * @param mean Mean of the scaled data
+    * @param std Standard deviation of the scaled data
+    * @tparam T Type of [[Vector]]
+    * @return Scaled vector
+    */
+  private def scaleVector[T <: Vector: BreezeVectorConverter](
+      vector: T,
+      dataMean: linalg.Vector[Double],
+      dataStd: linalg.Vector[Double],
+      mean: Double,
+      std: Double)
+    : T = {
+    var myVector = vector.asBreeze
+
+    myVector -= dataMean
+    myVector :/= dataStd
+    myVector = (myVector :* std) + mean
+    myVector.fromBreeze
+  }
 }