Posted to commits@flink.apache.org by tr...@apache.org on 2015/05/26 20:05:40 UTC

flink git commit: [FLINK-2083] [ml] [docs] Improves FlinkML documentation

Repository: flink
Updated Branches:
  refs/heads/master f78339faa -> b015a32f6


[FLINK-2083] [ml] [docs] Improves FlinkML documentation

Added tables for parameter values

Adds links to optimization parameter values in the description

This closes #715.


Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/b015a32f
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/b015a32f
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/b015a32f

Branch: refs/heads/master
Commit: b015a32f6126d759fc6dee90b78f90f7ff8dfbac
Parents: f78339f
Author: Theodore Vasiloudis <tv...@sics.se>
Authored: Wed May 20 13:35:44 2015 +0200
Committer: Till Rohrmann <tr...@apache.org>
Committed: Tue May 26 20:04:55 2015 +0200

----------------------------------------------------------------------
 docs/_includes/latex_commands.html         |   3 +
 docs/_layouts/base.html                    |   9 +-
 docs/libs/ml/als.md                        |  10 +-
 docs/libs/ml/distance_metrics.md           |  10 +-
 docs/libs/ml/multiple_linear_regression.md |   6 +-
 docs/libs/ml/optimization.md               | 231 ++++++++++++++++--------
 6 files changed, 179 insertions(+), 90 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/flink/blob/b015a32f/docs/_includes/latex_commands.html
----------------------------------------------------------------------
diff --git a/docs/_includes/latex_commands.html b/docs/_includes/latex_commands.html
index a070b68..b2f8f08 100644
--- a/docs/_includes/latex_commands.html
+++ b/docs/_includes/latex_commands.html
@@ -16,6 +16,8 @@ KIND, either express or implied.  See the License for the
 specific language governing permissions and limitations
 under the License.
 -->
+
+<!-- Some of the LaTeX math notation has been adapted from Apache Spark MLlib's documentation -->
 $$
 \newcommand{\R}{\mathbb{R}}
 \newcommand{\E}{\mathbb{E}}
@@ -32,4 +34,5 @@ $$
 \newcommand{\one}{\mathbf{1}}
 \newcommand{\zero}{\mathbf{0}}
 \newcommand\rfrac[2]{^{#1}\!/_{#2}}
+\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
 $$
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/flink/blob/b015a32f/docs/_layouts/base.html
----------------------------------------------------------------------
diff --git a/docs/_layouts/base.html b/docs/_layouts/base.html
index b5a2e7b..d4e74ef 100644
--- a/docs/_layouts/base.html
+++ b/docs/_layouts/base.html
@@ -34,7 +34,12 @@ under the License.
     <link rel="stylesheet" href="{{ site.baseurl }}/page/css/codetabs.css">
     {% if page.mathjax %}
     <script type="text/x-mathjax-config">
-        MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
+        MathJax.Hub.Config({
+        tex2jax: {
+          inlineMath: [['$','$'], ['\\(','\\)']] },
+        TeX: {
+          equationNumbers: { autoNumber: "AMS" } }
+        });
     </script>
     <script type="text/javascript"
             src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
@@ -47,7 +52,7 @@ under the License.
       <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
     <![endif]-->
   </head>
-  <body>  
+  <body>
     {% comment %} Includes are found in the _includes directory. {% endcomment %}
     {% include navbar.html %}
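
The added `equationNumbers: { autoNumber: "AMS" }` setting turns on automatic AMS-style equation
numbering, which the optimization docs below rely on for labelled, cross-referenced equations.
A minimal sketch of the resulting pattern (the label `eq:example` and the formula are only
illustrative):

{% highlight latex %}
\begin{equation} \label{eq:example}
    \hat{y} = \wv^T \x
\end{equation}

% referenced later in the running text as $\eqref{eq:example}$
{% endhighlight %}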
 

http://git-wip-us.apache.org/repos/asf/flink/blob/b015a32f/docs/libs/ml/als.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/als.md b/docs/libs/ml/als.md
index f827e4f..6b9e10f 100644
--- a/docs/libs/ml/als.md
+++ b/docs/libs/ml/als.md
@@ -114,9 +114,9 @@ The alternating least squares implementation can be controlled by the following
         <td><strong>TemporaryPath</strong></td>
         <td>
           <p>
-            Path to a temporary directory into which intermediate results are stored. 
-            If this value is set, then the algorithm is split into two preprocessing steps, the ALS iteration  and a post-processing step which calculates a last ALS half-step. 
-            The preprocessing steps calculate the <code>OutBlockInformation</code> and <code>InBlockInformation</code> for the given rating matrix. 
+            Path to a temporary directory into which intermediate results are stored.
+            If this value is set, then the algorithm is split into two preprocessing steps, the ALS iteration and a post-processing step which calculates a last ALS half-step.
+            The preprocessing steps calculate the <code>OutBlockInformation</code> and <code>InBlockInformation</code> for the given rating matrix.
             The results of the individual steps are stored in the specified directory.
             By splitting the algorithm into multiple smaller steps, Flink does not have to split the available memory amongst too many operators. 
             This allows the system to process bigger individual messages and improves the overall performance.
@@ -144,7 +144,7 @@ val als = ALS()
 // Set the other parameters via a parameter map
 val parameters = ParameterMap()
 .add(ALS.Lambda, 0.9)
-.add(ALS.Seed, 42l)
+.add(ALS.Seed, 42L)
 
 // Calculate the factorization
 als.fit(inputDS, parameters)
@@ -154,4 +154,4 @@ val testingDS: DataSet[(Int, Int)] = env.readCsvFile[(Int, Int)](pathToData)
 
 // Calculate the ratings according to the matrix factorization
 val predictedRatings = als.predict(testingDS)
-{% endhighlight %}
\ No newline at end of file
+{% endhighlight %}
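
As a further illustration of the parameter map used above, a sketch that also stores intermediate
results on disk via the `TemporaryPath` parameter described in the table. It assumes
`ALS.TemporaryPath` is exposed in the same way as `ALS.Lambda` and `ALS.Seed`; the directory is
only a placeholder.

{% highlight scala %}
// Sketch: configure ALS through a parameter map, additionally setting a temporary path.
// ALS.TemporaryPath is assumed to be a parameter analogous to ALS.Lambda and ALS.Seed.
val als = ALS()

val parameters = ParameterMap()
  .add(ALS.Lambda, 0.9)
  .add(ALS.Seed, 42L)
  .add(ALS.TemporaryPath, "/tmp/flink-als")  // placeholder directory for intermediate results

// inputDS as in the example above
als.fit(inputDS, parameters)
{% endhighlight %}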

http://git-wip-us.apache.org/repos/asf/flink/blob/b015a32f/docs/libs/ml/distance_metrics.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/distance_metrics.md b/docs/libs/ml/distance_metrics.md
index 4dfd1af..6f8b64d 100644
--- a/docs/libs/ml/distance_metrics.md
+++ b/docs/libs/ml/distance_metrics.md
@@ -26,9 +26,9 @@ under the License.
 
 ## Description
 
-Different metrics of distance are convenient for different types of analysis. Flink Machine
-Learning Library provides built-in implementations for many standard distance metric. You can also
-use other distance metric by implementing `DistanceMetric` trait.
+Different metrics of distance are convenient for different types of analysis. Flink ML provides
+built-in implementations for many standard distance metrics. You can create custom
+distance metrics by implementing the `DistanceMetric` trait.
 
 ## Built-in Implementations
 
@@ -91,7 +91,7 @@ Currently, FlinkML supports the following metrics:
 
 ## Custom Implementation
 
-You can use your own distance metric by implementing `DistanceMetric` trait.
+You can create your own distance metric by implementing the `DistanceMetric` trait.
 
 {% highlight scala %}
 class MyDistance extends DistanceMetric {
@@ -102,5 +102,5 @@ object MyDistance {
   def apply() = new MyDistance()
 }
 
-val metric = MyDistance()
+val myMetric = MyDistance()
 {% endhighlight %}
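
For illustration, a sketch of what a complete custom metric could look like, assuming the
`DistanceMetric` trait exposes a single `distance(a: Vector, b: Vector): Double` method and lives
under `org.apache.flink.ml.metrics.distances`; the Manhattan-style formula is only an example.

{% highlight scala %}
import org.apache.flink.ml.math.Vector
import org.apache.flink.ml.metrics.distances.DistanceMetric

// Sketch: a Manhattan-style distance, assuming the trait's abstract method is
// distance(a: Vector, b: Vector): Double
class ManhattanLikeDistance extends DistanceMetric {
  override def distance(a: Vector, b: Vector): Double = {
    require(a.size == b.size, "Vectors must have the same dimension")
    var sum = 0.0
    var i = 0
    while (i < a.size) {
      sum += math.abs(a(i) - b(i))  // accumulate absolute coordinate differences
      i += 1
    }
    sum
  }
}

object ManhattanLikeDistance {
  def apply() = new ManhattanLikeDistance()
}

val myMetric = ManhattanLikeDistance()
{% endhighlight %}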

http://git-wip-us.apache.org/repos/asf/flink/blob/b015a32f/docs/libs/ml/multiple_linear_regression.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/multiple_linear_regression.md b/docs/libs/ml/multiple_linear_regression.md
index c11425d..85a925d 100644
--- a/docs/libs/ml/multiple_linear_regression.md
+++ b/docs/libs/ml/multiple_linear_regression.md
@@ -27,7 +27,7 @@ under the License.
 ## Description
 
  Multiple linear regression tries to find a linear function which best fits the provided input data.
- Given a set of input data with its value $(\mathbf{x}, y)$, the multiple linear regression finds
+ Given a set of input data with its value $(\mathbf{x}, y)$, multiple linear regression finds
  a vector $\mathbf{w}$ such that the sum of the squared residuals is minimized:
 
  $$ S(\mathbf{w}) = \sum_{i=1} \left(y - \mathbf{w}^T\mathbf{x_i} \right)^2$$
@@ -42,13 +42,13 @@ under the License.
 
   However, in cases where the input data set is so huge that a complete parse over the whole data
   set is prohibitive, one can apply stochastic gradient descent (SGD) to approximate the solution.
-  The SGD first calculates for a random subset of the input data set the gradients. The gradient
+  SGD first calculates the gradients for a random subset of the input data set. The gradient
   for a given point $\mathbf{x}_i$ is given by:
 
   $$\nabla_{\mathbf{w}} S(\mathbf{w}, \mathbf{x_i}) = 2\left(\mathbf{w}^T\mathbf{x_i} -
     y\right)\mathbf{x_i}$$
 
-  The gradients are averaged and scaled. The scaling is defined by $\gamma = \frac{s}{\sqrt{j}}$
+  The gradients are averaged and scaled. The scaling is defined by $\gamma = s/\sqrt{j}$
   with $s$ being the initial step size and $j$ being the current iteration number. The resulting gradient is subtracted from the
   current weight vector giving the new weight vector for the next iteration:
 
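
To make the scaling concrete, a small sketch that applies $\gamma = s/\sqrt{j}$ to an averaged
gradient over a few iterations; all numbers are illustrative only.

{% highlight scala %}
// Sketch: decaying step size gamma = s / sqrt(j) applied to an averaged gradient.
val initialStepSize = 0.1                // s
val weights = Array(0.5, -0.3)           // current weight vector w
val avgGradient = Array(0.2, 0.1)        // averaged gradient over the sampled subset

for (j <- 1 to 3) {
  val gamma = initialStepSize / math.sqrt(j)  // step size for iteration j
  for (d <- weights.indices) {
    weights(d) -= gamma * avgGradient(d)      // w <- w - gamma * gradient
  }
  println(s"iteration $j: gamma = $gamma, weights = ${weights.mkString(", ")}")
}
{% endhighlight %}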

http://git-wip-us.apache.org/repos/asf/flink/blob/b015a32f/docs/libs/ml/optimization.md
----------------------------------------------------------------------
diff --git a/docs/libs/ml/optimization.md b/docs/libs/ml/optimization.md
index b30e0d0..d3ee9fb 100644
--- a/docs/libs/ml/optimization.md
+++ b/docs/libs/ml/optimization.md
@@ -1,7 +1,6 @@
 ---
 mathjax: include
-title: "ML - Optimization"
-displayTitle: <a href="index.md">ML</a> - Optimization
+title: "FlinkML - Optimization"
 ---
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
@@ -25,43 +24,25 @@ under the License.
 * Table of contents
 {:toc}
 
-$$
-\newcommand{\R}{\mathbb{R}}
-\newcommand{\E}{\mathbb{E}} 
-\newcommand{\x}{\mathbf{x}}
-\newcommand{\y}{\mathbf{y}}
-\newcommand{\wv}{\mathbf{w}}
-\newcommand{\av}{\mathbf{\alpha}}
-\newcommand{\bv}{\mathbf{b}}
-\newcommand{\N}{\mathbb{N}}
-\newcommand{\id}{\mathbf{I}}
-\newcommand{\ind}{\mathbf{1}} 
-\newcommand{\0}{\mathbf{0}} 
-\newcommand{\unit}{\mathbf{e}} 
-\newcommand{\one}{\mathbf{1}} 
-\newcommand{\zero}{\mathbf{0}}
-$$
-
 ## Mathematical Formulation
 
-The optimization framework in Flink is a developer-oriented package that can be used to solve
-[optimization](https://en.wikipedia.org/wiki/Mathematical_optimization) 
-problems common in Machine Learning (ML) tasks. In the supervised learning context, this usually 
-involves finding a model, as defined by a set of parameters $w$, that minimize a function $f(\wv)$ 
+The optimization framework in FlinkML is a developer-oriented package that can be used to solve
+[optimization](https://en.wikipedia.org/wiki/Mathematical_optimization)
+problems common in Machine Learning (ML) tasks. In the supervised learning context, this usually
+involves finding a model, as defined by a set of parameters $\wv$, that minimizes a function $f(\wv)$
 given a set of $(\x, y)$ examples,
-where $\x$ is a feature vector and $y$ is a real number, which can represent either a real value in 
-the regression case, or a class label in the classification case. In supervised learning, the 
+where $\x$ is a feature vector and $y$ is a real number, which can represent either a real value in
+the regression case, or a class label in the classification case. In supervised learning, the
 function to be minimized is usually of the form:
 
-$$
-\begin{equation}
-    f(\wv) := 
+
+\begin{equation} \label{eq:objectiveFunc}
+    f(\wv) :=
     \frac1n \sum_{i=1}^n L(\wv;\x_i,y_i) +
     \lambda\, R(\wv)
-    \label{eq:objectiveFunc}
     \ .
 \end{equation}
-$$
+
 
 where $L$ is the loss function and $R(\wv)$ the regularization penalty. We use $L$ to measure how
 well the model fits the observed data, and we use $R$ in order to impose a complexity cost to the
@@ -69,42 +50,42 @@ model, with $\lambda > 0$ being the regularization parameter.
 
 ### Loss Functions
 
-In supervised learning, we use loss functions in order to measure the model fit, by 
-penalizing errors in the predictions $p$ made by the model compared to the true $y$ for each 
-example. Different loss function can be used for regression (e.g. Squared Loss) and classification
-(e.g. Hinge Loss).
+In supervised learning, we use loss functions in order to measure the model fit, by
+penalizing errors in the predictions $p$ made by the model compared to the true $y$ for each
+example. Different loss functions can be used for regression (e.g. Squared Loss) and classification
+(e.g. Hinge Loss) tasks.
 
 Some common loss functions are:
- 
-* Squared Loss: $ \frac{1}{2} (\wv^T \x - y)^2, \quad y \in \R $ 
-* Hinge Loss: $ \max (0, 1 - y ~ \wv^T \x), \quad y \in \{-1, +1\} $
-* Logistic Loss: $ \log(1+\exp( -y ~ \wv^T \x)), \quad y \in \{-1, +1\} $
 
-Currently, only the Squared Loss function is implemented in Flink.
+* Squared Loss: $ \frac{1}{2} \left(\wv^T \cdot \x - y\right)^2, \quad y \in \R $
+* Hinge Loss: $ \max \left(0, 1 - y ~ \wv^T \cdot \x\right), \quad y \in \{-1, +1\} $
+* Logistic Loss: $ \log\left(1+\exp\left( -y ~ \wv^T \cdot \x\right)\right), \quad y \in \{-1, +1\}$
 
 ### Regularization Types
 
-[Regularization](https://en.wikipedia.org/wiki/Regularization_(mathematics)) in machine learning is 
+[Regularization](https://en.wikipedia.org/wiki/Regularization_(mathematics)) in machine learning
 imposes penalties to the estimated models, in order to reduce overfitting. The most common penalties
 are the $L_1$ and $L_2$ penalties, defined as:
 
-* $L_1$: $R(\wv) = \|\wv\|_1$
-* $L_2$: $R(\wv) = \frac{1}{2}\|\wv\|_2^2$
+* $L_1$: $R(\wv) = \norm{\wv}_1$
+* $L_2$: $R(\wv) = \frac{1}{2}\norm{\wv}_2^2$
 
 The $L_2$ penalty penalizes large weights, favoring solutions with more small weights rather than
 few large ones.
 The $L_1$ penalty can be used to drive a number of the solution coefficients to 0, thereby
 producing sparse solutions.
-The optimization framework in Flink supports the $L_1$ and $L_2$ penalties, as well as no 
-regularization. The 
-regularization parameter $\lambda$ in $\eqref{objectiveFunc}$ determines the amount of 
+The optimization framework in FlinkML supports the $L_1$ and $L_2$ penalties, as well as no
+regularization. The
+regularization parameter $\lambda$ in $\eqref{eq:objectiveFunc}$ determines the amount of
 regularization applied to the model,
-and is usually determined through model cross-validation.
+and is usually determined through model cross-validation. A good comparison of regularization
+types can
+be found in [this](http://www.robotics.stanford.edu/~ang/papers/icml04-l1l2.pdf) paper by Andrew Ng.
 
 ## Stochastic Gradient Descent
 
 In order to find a (local) minimum of a function, Gradient Descent methods take steps in the
-direction opposite to the gradient of the function $\eqref{objectiveFunc}$ taken with
+direction opposite to the gradient of the function $\eqref{eq:objectiveFunc}$ taken with
 respect to the current parameters (weights).
 In order to compute the exact gradient we need to perform one pass through all the points in
 a dataset, making the process computationally expensive.
@@ -116,12 +97,12 @@ over each batch. At each iteration of the algorithm we update the weights once,
 the average of the gradients computed from each mini-batch.
 
 An important parameter is the learning rate $\eta$, or step size, which is currently determined as
-$\eta = \frac{\eta_0}{\sqrt{j}}$, where $\eta_0$ is the initial step size and $j$ is the iteration 
-number. The setting of the initial step size can significantly affect the performance of the 
-algorithm. For some practical tips on tuning SGD see Leon Botou's 
+$\eta = \eta_0/\sqrt{j}$, where $\eta_0$ is the initial step size and $j$ is the iteration
+number. The setting of the initial step size can significantly affect the performance of the
+algorithm. For some practical tips on tuning SGD see Léon Bottou's
 "[Stochastic Gradient Descent Tricks](http://research.microsoft.com/pubs/192769/tricks-2012.pdf)".
 
-The current implementation of SGD  uses the whole partition, making it 
+The current implementation of SGD uses the whole partition, making it
 effectively a batch gradient descent. Once a sampling operator has been introduced in Flink, true
 mini-batch SGD will be performed.
 
@@ -129,7 +110,7 @@ mini-batch SGD will be performed.
 ### Parameters
 
   The stochastic gradient descent implementation can be controlled by the following parameters:
-  
+
    <table class="table table-bordered">
     <thead>
       <tr>
@@ -139,10 +120,10 @@ mini-batch SGD will be performed.
     </thead>
     <tbody>
       <tr>
-        <td><strong>Loss Function</strong></td>
+        <td><strong>LossFunction</strong></td>
         <td>
           <p>
-            The class of the loss function to be used. (Default value: 
+            The class of the loss function to be used. See <a href="#loss-function-values">loss function values</a> for a list of supported values. (Default value:
             <strong>SquaredLoss</strong>, used for regression tasks)
           </p>
         </td>
@@ -151,7 +132,7 @@ mini-batch SGD will be performed.
         <td><strong>RegularizationType</strong></td>
         <td>
           <p>
-            The type of regularization penalty to apply. (Default value: 
+            The type of regularization penalty to apply. See <a href="#regularization-function-values">regularization function values</a> for a list of supported values. (Default value:
             <strong>NoRegularization</strong>)
           </p>
         </td>
@@ -160,10 +141,21 @@ mini-batch SGD will be performed.
         <td><strong>RegularizationParameter</strong></td>
         <td>
           <p>
-            The amount of regularization to apply. (Default value:<strong>0</strong>)
+            The amount of regularization to apply. (Default value: <strong>0.0</strong>)
           </p>
         </td>
-      </tr>     
+      </tr>
+      <tr>
+        <td><strong>PredictionFunction</strong></td>
+        <td>
+          <p>
+            Class that provides the prediction function, used to calculate $\hat{y}$ and the
+            prediction gradient based on the weights $\wv$ and the example features $\x$. See
+            <a href="#prediction-function-values">prediction function values</a> for a list of supported values.
+            (Default value: <strong>LinearPrediction</strong>)
+          </p>
+        </td>
+      </tr>
       <tr>
         <td><strong>Iterations</strong></td>
         <td>
@@ -177,21 +169,115 @@ mini-batch SGD will be performed.
         <td>
           <p>
             Initial step size for the gradient descent method.
-            This value controls how far the gradient descent method moves in the opposite direction 
+            This value controls how far the gradient descent method moves in the opposite direction
             of the gradient.
             (Default value: <strong>0.1</strong>)
           </p>
         </td>
       </tr>
       <tr>
-        <td><strong>Prediction Function</strong></td>
+        <td><strong>ConvergenceThreshold</strong></td>
         <td>
           <p>
-            Class that provides the prediction function, used to calculate $\hat{y}$ based on the 
-            weights and the example features, and the prediction gradient.
-            (Default value: <strong>LinearPrediction</strong>)
+            When set, iterations stop if the relative change in the value of the objective function $\eqref{eq:objectiveFunc}$ is less than the provided threshold, $\tau$.
+            The convergence criterion is defined as follows: $\left| \frac{f(\wv)_{i-1} - f(\wv)_i}{f(\wv)_{i-1}}\right| < \tau$.
+            (Default value: <strong>None</strong>)
+          </p>
+        </td>
+      </tr>
+    </tbody>
+  </table>
+  
+#### Loss Function Values
+
+  <table class="table table-bordered">
+    <thead>
+      <tr>
+        <th class="text-left" style="width: 20%">Function Name</th>
+        <th class="text-center">Description</th>
+        <th class="text-center">Loss</th>
+        <th class="text-center">Loss Derivative</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td><strong>SquaredLoss</strong></td>
+        <td>
+          <p>
+            Loss function most commonly used for regression tasks.
           </p>
         </td>
+        <td class="text-center">$\frac{1}{2} (\wv^T \cdot \x - y)^2$</td>
+        <td class="text-center">$\wv^T \cdot \x - y$</td>
+      </tr>
+    </tbody>
+  </table>
+
+#### Prediction Function Values
+
+  <table class="table table-bordered">
+      <thead>
+        <tr>
+          <th class="text-left" style="width: 20%">Function Name</th>
+          <th class="text-center">Description</th>
+          <th class="text-center">Prediction</th>
+          <th class="text-center">Prediction Gradient</th>
+        </tr>
+      </thead>
+      <tbody>
+        <tr>
+          <td><strong>LinearPrediction</strong></td>
+          <td>
+            <p>
+              The function most commonly used for linear models, such as linear regression and
+              linear classifiers.
+            </p>
+          </td>
+          <td class="text-center">$\x^T \cdot \wv$</td>
+          <td class="text-center">$\x$</td>
+        </tr>
+      </tbody>
+    </table>
+
+#### Regularization Function Values
+
+  <table class="table table-bordered">
+    <thead>
+      <tr>
+        <th class="text-left" style="width: 20%">Regularization Name</th>
+        <th class="text-center">Description</th>
+        <th class="text-center">$R(\wv)$</th>
+      </tr>
+    </thead>
+    <tbody>
+      <tr>
+        <td><strong>L1Regularization</strong></td>
+        <td>
+          <p>
+            This type of regularization will drive small weights to 0, potentially providing sparse
+            solutions.
+          </p>
+        </td>
+        <td class="text-center">$\norm{\wv}_1$</td>
+      </tr>
+      <tr>
+        <td><strong>L2Regularization</strong></td>
+        <td>
+          <p>
+            This type of regularization will keep weights from growing too large, favoring solutions
+            with more small weights, rather than few large ones.
+          </p>
+        </td>
+        <td class="text-center">$\frac{1}{2}\norm{\wv}_2^2$</td>
+      </tr>
+      <tr>
+        <td><strong>NoRegularization</strong></td>
+        <td>
+          <p>
+            No regularization is applied to the weights when this regularization type is used.
+          </p>
+        </td>
+        <td class="text-center">$0$</td>
       </tr>
     </tbody>
   </table>
@@ -212,24 +298,19 @@ weight vector. This allows us to avoid applying regularization to the intercept.
 {% highlight scala %}
 // Create stochastic gradient descent solver
 val sgd = GradientDescent()
-.setLossFunction(new SquaredLoss)
-.setRegularizationType(new L1Regularization)
-.setRegularizationParameter(0.2)
-.setIterations(100)
-.setStepsize(0.01)
+  .setLossFunction(SquaredLoss())
+  .setRegularizationType(L1Regularization())
+  .setRegularizationParameter(0.2)
+  .setIterations(100)
+  .setStepsize(0.01)
 
 
 // Obtain data
 val trainingDS: DataSet[LabeledVector] = ...
 
-// Fit the solver to the provided data, using initial weights set to 0.0
-val weightDS = sgd.optimize(inputDS, None)
-
-// Retrieve the optimized weights
-val weightVector = weightDS
+// Optimize the weights, according to the provided data
+val weightDS = sgd.optimize(trainingDS)
 
-// We can now use the weightVector to make predictions
+// We can now use weightDS to make predictions
 
 {% endhighlight %}
-
-Note: Some of the Latex math notation was adapted from Apache Spark MLlib's documentation
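
For completeness, a sketch showing how the additional SGD parameters documented in the tables
above could be configured. Only the five setters in the example above appear in this commit;
`setConvergenceThreshold` and `setPredictionFunction` are assumed here to exist as setters
mirroring the `ConvergenceThreshold` and `PredictionFunction` parameters, and `LinearPrediction()`
is assumed to be constructed like `SquaredLoss()`.

{% highlight scala %}
// Sketch only: the last two setters and LinearPrediction() are assumptions, see note above.
val sgd = GradientDescent()
  .setLossFunction(SquaredLoss())
  .setRegularizationType(L1Regularization())
  .setRegularizationParameter(0.2)
  .setIterations(100)
  .setStepsize(0.01)
  .setConvergenceThreshold(0.001)            // stop when the relative change in f(w) drops below tau
  .setPredictionFunction(LinearPrediction())

// Obtain data and optimize the weights, as in the example above
val trainingDS: DataSet[LabeledVector] = ...
val weightDS = sgd.optimize(trainingDS)
{% endhighlight %}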