You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by tr...@apache.org on 2015/06/09 10:29:53 UTC
[2/2] flink git commit: [ml] Makes StandardScaler's state package
private and reduces redundant code. Adjusts flink-ml readme.
[ml] Makes StandardScaler's state package private and reduces redundant code. Adjusts flink-ml readme.
Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/97611c24
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/97611c24
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/97611c24
Branch: refs/heads/master
Commit: 97611c245f4df5820124fba25e55a2bac59086b4
Parents: 73f9911
Author: Till Rohrmann <tr...@apache.org>
Authored: Tue Jun 9 10:23:09 2015 +0200
Committer: Till Rohrmann <tr...@apache.org>
Committed: Tue Jun 9 10:29:24 2015 +0200
----------------------------------------------------------------------
flink-staging/flink-ml/README.md | 5 +--
.../flink/ml/preprocessing/StandardScaler.scala | 44 +++++++++++++-------
2 files changed, 32 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/flink/blob/97611c24/flink-staging/flink-ml/README.md
----------------------------------------------------------------------
diff --git a/flink-staging/flink-ml/README.md b/flink-staging/flink-ml/README.md
index 5721e8f..5cabd7c 100644
--- a/flink-staging/flink-ml/README.md
+++ b/flink-staging/flink-ml/README.md
@@ -7,10 +7,9 @@ Theses implementations allow to scale to data sizes which vastly exceed the memo
Flink-ML currently comprises the following algorithms:
* Classification
+** Soft-margin SVM
* Regression
-** Logistic regression
-* Clustering
-** k-Means
+** Multiple linear regression
* Recommendation
** Alternating least squares (ALS)
http://git-wip-us.apache.org/repos/asf/flink/blob/97611c24/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
----------------------------------------------------------------------
diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
index 3b9c8d2..bf09b20 100644
--- a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
+++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala
@@ -21,10 +21,8 @@ package org.apache.flink.ml.preprocessing
import breeze.linalg
import breeze.numerics.sqrt
import breeze.numerics.sqrt._
-import org.apache.flink.api.common.functions._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._
-import org.apache.flink.configuration.Configuration
import org.apache.flink.ml._
import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap}
import org.apache.flink.ml.math.Breeze._
@@ -62,7 +60,9 @@ import scala.reflect.ClassTag
*/
class StandardScaler extends Transformer[StandardScaler] {
- var metricsOption: Option[DataSet[(linalg.Vector[Double], linalg.Vector[Double])]] = None
+ private[preprocessing] var metricsOption: Option[
+ DataSet[(linalg.Vector[Double], linalg.Vector[Double])]
+ ] = None
/** Sets the target mean of the transformed data
*
@@ -213,12 +213,7 @@ object StandardScaler {
input.mapWithBcVariable(metrics){
(vector, metrics) => {
val (broadcastMean, broadcastStd) = metrics
- var myVector = vector.asBreeze
-
- myVector -= broadcastMean
- myVector :/= broadcastStd
- myVector = (myVector :* std) + mean
- myVector.fromBreeze
+ scaleVector(vector, broadcastMean, broadcastStd, mean, std)
}
}
}
@@ -245,12 +240,8 @@ object StandardScaler {
(labeledVector, metrics) => {
val (broadcastMean, broadcastStd) = metrics
val LabeledVector(label, vector) = labeledVector
- var breezeVector = vector.asBreeze
- breezeVector -= broadcastMean
- breezeVector :/= broadcastStd
- breezeVector = (breezeVector :* std) + mean
- LabeledVector(label, breezeVector.fromBreeze)
+ LabeledVector(label, scaleVector(vector, broadcastMean, broadcastStd, mean, std))
}
}
}
@@ -262,4 +253,29 @@ object StandardScaler {
}
}
}
+
+ /** Scales the given vector such that it has the given mean and std
+ *
+ * @param vector Vector to be scaled
+ * @param dataMean Mean of the training data
+ * @param dataStd Standard deviation of the training data
+ * @param mean Mean of the scaled data
+ * @param std Standard deviation of the scaled data
+ * @tparam T Type of [[Vector]]
+ * @return Scaled vector
+ */
+ private def scaleVector[T <: Vector: BreezeVectorConverter](
+ vector: T,
+ dataMean: linalg.Vector[Double],
+ dataStd: linalg.Vector[Double],
+ mean: Double,
+ std: Double)
+ : T = {
+ var myVector = vector.asBreeze
+
+ myVector -= dataMean
+ myVector :/= dataStd
+ myVector = (myVector :* std) + mean
+ myVector.fromBreeze
+ }
}