Posted to commits@spark.apache.org by me...@apache.org on 2015/02/05 08:03:51 UTC

spark git commit: [SPARK-5599] Check MLlib public APIs for 1.3

Repository: spark
Updated Branches:
  refs/heads/master 975bcef46 -> db3469046


[SPARK-5599] Check MLlib public APIs for 1.3

There are no breaking changes (against 1.2) in this PR. I hid `PythonMLLibAPI`, which is only called by Py4J, and renamed `SparseMatrix.diag` to `SparseMatrix.spdiag`. All other changes are documentation and annotations. The `Experimental` tag is removed from `ALS.setAlpha` and `Rating`. One issue not addressed in this PR is `setCheckpointDir` in `LDA` (https://issues.apache.org/jira/browse/SPARK-5604).
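
For illustration, a minimal sketch of the two user-visible call-site changes in the diff below (the `spdiag` rename and the now-parameterless `toSparse`/`toDense`); the values are made up:

    import org.apache.spark.mllib.linalg.{DenseMatrix, SparseMatrix, Vectors}

    // SparseMatrix.diag(vector) is now SparseMatrix.spdiag(vector)
    val sm: SparseMatrix = SparseMatrix.spdiag(Vectors.dense(1.0, 2.0, 3.0))

    // toSparse/toDense no longer take an empty parameter list
    val dm: DenseMatrix = sm.toDense
    val back: SparseMatrix = dm.toSparse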

CC: srowen jkbradley

Author: Xiangrui Meng <me...@databricks.com>

Closes #4377 from mengxr/SPARK-5599 and squashes the following commits:

17975dc [Xiangrui Meng] fix tests
4487f20 [Xiangrui Meng] remove experimental tag from each stat method because Statistics is experimental already
3cd969a [Xiangrui Meng] remove freeman (sorry~) from StreamLA public doc
55900f5 [Xiangrui Meng] make IR experimental and update its doc
9b8eed3 [Xiangrui Meng] graduate Rating and setAlpha in ALS
b854d28 [Xiangrui Meng] correct iid doc in RandomRDDs
27f5bdd [Xiangrui Meng] update linalg docs and some new method signatures
371721b [Xiangrui Meng] mark fpg as experimental and update its doc
8aca7ee [Xiangrui Meng] change SLR to experimental and update the doc
ebbb2e9 [Xiangrui Meng] mark PIC experimental and update the doc
7830d3b [Xiangrui Meng] mark GMM experimental
a378496 [Xiangrui Meng] use the correct subscript syntax in PIC
c65c424 [Xiangrui Meng] update LDAModel doc
a213b0c [Xiangrui Meng] update GMM constructor
3993054 [Xiangrui Meng] hide algorithm in SLR
ad6b9ce [Xiangrui Meng] Revert "make ClassificatinModel.predict(JavaRDD) return JavaDoubleRDD"
0054684 [Xiangrui Meng] add doc to LRModel's constructor
a89763b [Xiangrui Meng] make ClassificatinModel.predict(JavaRDD) return JavaDoubleRDD
7c0946c [Xiangrui Meng] hide PythonMLLibAPI


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db346904
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db346904
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db346904

Branch: refs/heads/master
Commit: db34690466d67f9c8ac6a145fddb5f7ea30a8d8d
Parents: 975bcef
Author: Xiangrui Meng <me...@databricks.com>
Authored: Wed Feb 4 23:03:47 2015 -0800
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Wed Feb 4 23:03:47 2015 -0800

----------------------------------------------------------------------
 .../spark/mllib/api/python/PythonMLLibAPI.scala |  6 +--
 .../classification/LogisticRegression.scala     |  3 ++
 .../StreamingLogisticRegressionWithSGD.scala    |  5 +-
 .../mllib/clustering/GaussianMixture.scala      | 13 +++--
 .../mllib/clustering/GaussianMixtureModel.scala |  6 ++-
 .../spark/mllib/clustering/LDAModel.scala       |  2 +-
 .../clustering/PowerIterationClustering.scala   | 26 ++++++---
 .../mllib/clustering/StreamingKMeans.scala      | 16 ++++--
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   | 27 +++++++---
 .../apache/spark/mllib/linalg/Matrices.scala    | 48 +++++++++--------
 .../org/apache/spark/mllib/linalg/Vectors.scala |  4 +-
 .../mllib/linalg/distributed/BlockMatrix.scala  | 12 ++++-
 .../apache/spark/mllib/random/RandomRDDs.scala  | 57 ++++++++++----------
 .../apache/spark/mllib/recommendation/ALS.scala |  4 --
 .../mllib/regression/IsotonicRegression.scala   | 13 ++++-
 .../regression/StreamingLinearAlgorithm.scala   |  4 +-
 .../apache/spark/mllib/stat/Statistics.scala    | 21 +-------
 .../spark/mllib/linalg/JavaMatricesSuite.java   |  4 +-
 .../spark/mllib/linalg/MatricesSuite.scala      |  8 +--
 19 files changed, 160 insertions(+), 119 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 3f29b82..cbd87ea 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -54,11 +54,9 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
 
 /**
- * :: DeveloperApi ::
- * The Java stubs necessary for the Python mllib bindings.
+ * The Java stubs necessary for the Python mllib bindings. It is called by Py4J on the Python side.
  */
-@DeveloperApi
-class PythonMLLibAPI extends Serializable {
+private[python] class PythonMLLibAPI extends Serializable {
 
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index 5c9feb6..a668e7a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -62,6 +62,9 @@ class LogisticRegressionModel (
       s" but was given weights of length ${weights.size}")
   }
 
+  /**
+   * Constructs a [[LogisticRegressionModel]] with weights and intercept for binary classification.
+   */
   def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)
 
   private var threshold: Option[Double] = Some(0.5)

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
index 6a3893d..b89f38c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionWithSGD.scala
@@ -35,12 +35,13 @@ import org.apache.spark.mllib.regression.StreamingLinearAlgorithm
  * Use a builder pattern to construct a streaming logistic regression
  * analysis in an application, like:
  *
+ * {{{
  *  val model = new StreamingLogisticRegressionWithSGD()
  *    .setStepSize(0.5)
  *    .setNumIterations(10)
  *    .setInitialWeights(Vectors.dense(...))
  *    .trainOn(DStream)
- *
+ * }}}
  */
 @Experimental
 class StreamingLogisticRegressionWithSGD private[mllib] (
@@ -59,7 +60,7 @@ class StreamingLogisticRegressionWithSGD private[mllib] (
    */
   def this() = this(0.1, 50, 1.0, 0.0)
 
-  val algorithm = new LogisticRegressionWithSGD(
+  protected val algorithm = new LogisticRegressionWithSGD(
     stepSize, numIterations, regParam, miniBatchFraction)
 
   /** Set the step size for gradient descent. Default: 0.1. */

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 5c626fd..0be3014 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -19,15 +19,18 @@ package org.apache.spark.mllib.clustering
 
 import scala.collection.mutable.IndexedSeq
 
-import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, diag, Transpose}
+import breeze.linalg.{DenseMatrix => BreezeMatrix, DenseVector => BreezeVector, Transpose, diag}
 
-import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors, DenseVector, DenseMatrix, BLAS}
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix, DenseVector, Matrices, Vector, Vectors}
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
 
 /**
+ * :: Experimental ::
+ *
  * This class performs expectation maximization for multivariate Gaussian
  * Mixture Models (GMMs).  A GMM represents a composite distribution of
  * independent Gaussian distributions with associated "mixing" weights
@@ -44,13 +47,17 @@ import org.apache.spark.util.Utils
  * is considered to have occurred.
  * @param maxIterations The maximum number of iterations to perform
  */
+@Experimental
 class GaussianMixture private (
     private var k: Int, 
     private var convergenceTol: Double, 
     private var maxIterations: Int,
     private var seed: Long) extends Serializable {
   
-  /** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */
+  /**
+   * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
+   * maxIterations: 100, seed: random}.
+   */
   def this() = this(2, 0.01, 100, Utils.random.nextLong())
   
   // number of samples per cluster to use when initializing Gaussians
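
A minimal usage sketch for the GMM builder documented above; it assumes the usual setK/setMaxIterations setters and the run(RDD[Vector]) entry point (not shown in this hunk), with `sc` an existing SparkContext:

    import org.apache.spark.mllib.clustering.GaussianMixture
    import org.apache.spark.mllib.linalg.Vectors

    val data = sc.parallelize(Seq(
      Vectors.dense(0.1, 0.2), Vectors.dense(0.2, 0.1),
      Vectors.dense(9.0, 8.0), Vectors.dense(8.0, 9.0)))

    // defaults: k = 2, convergenceTol = 0.01, maxIterations = 100, random seed
    val gmm = new GaussianMixture().setK(2).setMaxIterations(50).run(data)
    println(gmm.weights.mkString(", "))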

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a2178e..af6f83c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -19,12 +19,15 @@ package org.apache.spark.mllib.clustering
 
 import breeze.linalg.{DenseVector => BreezeVector}
 
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
 
 /**
+ * :: Experimental ::
+ *
  * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points 
  * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are 
  * the respective mean and covariance for each Gaussian distribution i=1..k. 
@@ -35,6 +38,7 @@ import org.apache.spark.mllib.util.MLUtils
  * @param sigma Covariance maxtrix for each Gaussian in the mixture, where sigma(i) is the
  *              covariance matrix for Gaussian i
  */
+@Experimental
 class GaussianMixtureModel(
   val weights: Array[Double], 
   val gaussians: Array[MultivariateGaussian]) extends Serializable {

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 19e8aab..b0e991d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -335,7 +335,7 @@ class DistributedLDAModel private (
 
   /**
    * For each document in the training set, return the distribution over topics for that document
-   * (i.e., "theta_doc").
+   * ("theta_doc").
    *
    * @return  RDD of (document ID, topic distribution) pairs
    */

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 9b5c155..3b1caf0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.mllib.clustering
 
 import org.apache.spark.{Logging, SparkException}
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.impl.GraphImpl
 import org.apache.spark.mllib.linalg.Vectors
@@ -26,25 +27,33 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.util.random.XORShiftRandom
 
 /**
+ * :: Experimental ::
+ *
  * Model produced by [[PowerIterationClustering]].
  *
  * @param k number of clusters
  * @param assignments an RDD of (vertexID, clusterID) pairs
  */
+@Experimental
 class PowerIterationClusteringModel(
     val k: Int,
     val assignments: RDD[(Long, Int)]) extends Serializable
 
 /**
- * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by Lin and
- * Cohen (see http://www.icml2010.org/papers/387.pdf). From the abstract: PIC finds a very
+ * :: Experimental ::
+ *
+ * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
+ * [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very
  * low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise
  * similarity matrix of the data.
  *
  * @param k Number of clusters.
  * @param maxIterations Maximum number of iterations of the PIC algorithm.
  * @param initMode Initialization mode.
+ *
+ * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
  */
+@Experimental
 class PowerIterationClustering private[clustering] (
     private var k: Int,
     private var maxIterations: Int,
@@ -88,11 +97,12 @@ class PowerIterationClustering private[clustering] (
   /**
    * Run the PIC algorithm.
    *
-   * @param similarities an RDD of (i, j, s_ij_) tuples representing the affinity matrix, which is
-   *                     the matrix A in the PIC paper. The similarity s_ij_ must be nonnegative.
-   *                     This is a symmetric matrix and hence s_ij_ = s_ji_. For any (i, j) with
-   *                     nonzero similarity, there should be either (i, j, s_ij_) or (j, i, s_ji_)
-   *                     in the input. Tuples with i = j are ignored, because we assume s_ij_ = 0.0.
+   * @param similarities an RDD of (i, j, s,,ij,,) tuples representing the affinity matrix, which is
+   *                     the matrix A in the PIC paper. The similarity s,,ij,, must be nonnegative.
+   *                     This is a symmetric matrix and hence s,,ij,, = s,,ji,,. For any (i, j) with
+   *                     nonzero similarity, there should be either (i, j, s,,ij,,) or
+   *                     (j, i, s,,ji,,) in the input. Tuples with i = j are ignored, because we
+   *                     assume s,,ij,, = 0.0.
    *
    * @return a [[PowerIterationClusteringModel]] that contains the clustering result
    */
@@ -109,7 +119,7 @@ class PowerIterationClustering private[clustering] (
    * Runs the PIC algorithm.
    *
    * @param w The normalized affinity matrix, which is the matrix W in the PIC paper with
-   *          w_ij_ = a_ij_ / d_ii_ as its edge properties and the initial vector of the power
+   *          w,,ij,, = a,,ij,, / d,,ii,, as its edge properties and the initial vector of the power
    *          iteration as its vertex properties.
    */
   private def pic(w: Graph[Double, Double]): PowerIterationClusteringModel = {
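
A hedged sketch of the run(...) call documented above, with a made-up affinity list; setK is assumed to be the standard builder setter for the k parameter, and `sc` an existing SparkContext:

    import org.apache.spark.mllib.clustering.PowerIterationClustering

    // For each pair, provide either (i, j, s_ij) or (j, i, s_ji); s_ij must be nonnegative.
    val similarities = sc.parallelize(Seq(
      (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.1), (3L, 4L, 0.9)))

    val model = new PowerIterationClustering().setK(2).run(similarities)
    model.assignments.collect().foreach { case (vertexId, clusterId) =>
      println(s"$vertexId -> $clusterId")
    }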

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index 7752c19..f483fd1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -21,7 +21,7 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.Logging
 import org.apache.spark.SparkContext._
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Experimental, DeveloperApi}
 import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.streaming.dstream.DStream
@@ -29,7 +29,8 @@ import org.apache.spark.util.Utils
 import org.apache.spark.util.random.XORShiftRandom
 
 /**
- * :: DeveloperApi ::
+ * :: Experimental ::
+ *
  * StreamingKMeansModel extends MLlib's KMeansModel for streaming
  * algorithms, so it can keep track of a continuously updated weight
  * associated with each cluster, and also update the model by
@@ -39,8 +40,10 @@ import org.apache.spark.util.random.XORShiftRandom
  * generalized to incorporate forgetfullness (i.e. decay).
  * The update rule (for each cluster) is:
  *
+ * {{{
  * c_t+1 = [(c_t * n_t * a) + (x_t * m_t)] / [n_t + m_t]
  * n_t+t = n_t * a + m_t
+ * }}}
  *
  * Where c_t is the previously estimated centroid for that cluster,
  * n_t is the number of points assigned to it thus far, x_t is the centroid
@@ -61,7 +64,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * as batches or points.
  *
  */
-@DeveloperApi
+@Experimental
 class StreamingKMeansModel(
     override val clusterCenters: Array[Vector],
     val clusterWeights: Array[Double]) extends KMeansModel(clusterCenters) with Logging {
@@ -140,7 +143,8 @@ class StreamingKMeansModel(
 }
 
 /**
- * :: DeveloperApi ::
+ * :: Experimental ::
+ *
  * StreamingKMeans provides methods for configuring a
  * streaming k-means analysis, training the model on streaming,
  * and using the model to make predictions on streaming data.
@@ -149,13 +153,15 @@ class StreamingKMeansModel(
  * Use a builder pattern to construct a streaming k-means analysis
  * in an application, like:
  *
+ * {{{
  *  val model = new StreamingKMeans()
  *    .setDecayFactor(0.5)
  *    .setK(3)
  *    .setRandomCenters(5, 100.0)
  *    .trainOn(DStream)
+ * }}}
  */
-@DeveloperApi
+@Experimental
 class StreamingKMeans(
     var k: Int,
     var decayFactor: Double,
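
The update rule documented above, transcribed literally into a small standalone helper (a reading aid only, not the code path StreamingKMeansModel actually uses):

    // c_{t+1} = [(c_t * n_t * a) + (x_t * m_t)] / [n_t + m_t]
    // n_{t+1} = n_t * a + m_t
    def updateOneCluster(
        cT: Array[Double],  // previously estimated centroid
        nT: Double,         // number of points assigned so far
        xT: Array[Double],  // centroid of points in the new batch
        mT: Double,         // number of points in the new batch
        a: Double           // decay factor
      ): (Array[Double], Double) = {
      val cNext = cT.zip(xT).map { case (c, x) => (c * nT * a + x * mT) / (nT + mT) }
      (cNext, nT * a + mT)
    }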

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 1433ee9..3168d60 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -25,16 +25,20 @@ import scala.collection.JavaConverters._
 import scala.reflect.ClassTag
 
 import org.apache.spark.{HashPartitioner, Logging, Partitioner, SparkException}
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.{JavaPairRDD, JavaRDD}
 import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
 /**
+ * :: Experimental ::
+ *
  * Model trained by [[FPGrowth]], which holds frequent itemsets.
  * @param freqItemsets frequent itemset, which is an RDD of (itemset, frequency) pairs
  * @tparam Item item type
  */
+@Experimental
 class FPGrowthModel[Item: ClassTag](
     val freqItemsets: RDD[(Array[Item], Long)]) extends Serializable {
 
@@ -45,28 +49,35 @@ class FPGrowthModel[Item: ClassTag](
 }
 
 /**
- * This class implements Parallel FP-growth algorithm to do frequent pattern matching on input data.
- * Parallel FPGrowth (PFP) partitions computation in such a way that each machine executes an
- * independent group of mining tasks. More detail of this algorithm can be found at
- * [[http://dx.doi.org/10.1145/1454008.1454027, PFP]], and the original FP-growth paper can be
- * found at [[http://dx.doi.org/10.1145/335191.335372, FP-growth]]
+ * :: Experimental ::
+ *
+ * A parallel FP-growth algorithm to mine frequent itemsets. The algorithm is described in
+ * [[http://dx.doi.org/10.1145/1454008.1454027 Li et al., PFP: Parallel FP-Growth for Query
+ *  Recommendation]]. PFP distributes computation in such a way that each worker executes an
+ * independent group of mining tasks. The FP-Growth algorithm is described in
+ * [[http://dx.doi.org/10.1145/335191.335372 Han et al., Mining frequent patterns without candidate
+ *  generation]].
  *
  * @param minSupport the minimal support level of the frequent pattern, any pattern appears
  *                   more than (minSupport * size-of-the-dataset) times will be output
  * @param numPartitions number of partitions used by parallel FP-growth
+ *
+ * @see [[http://en.wikipedia.org/wiki/Association_rule_learning Association rule learning
+ *       (Wikipedia)]]
  */
+@Experimental
 class FPGrowth private (
     private var minSupport: Double,
     private var numPartitions: Int) extends Logging with Serializable {
 
   /**
-   * Constructs a FPGrowth instance with default parameters:
-   * {minSupport: 0.3, numPartitions: auto}
+   * Constructs a default instance with default parameters {minSupport: `0.3`, numPartitions: same
+   * as the input data}.
    */
   def this() = this(0.3, -1)
 
   /**
-   * Sets the minimal support level (default: 0.3).
+   * Sets the minimal support level (default: `0.3`).
    */
   def setMinSupport(minSupport: Double): this.type = {
     this.minSupport = minSupport
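
A minimal usage sketch for the PFP API above, assuming the usual run(RDD[Array[Item]]) entry point and setNumPartitions setter (not part of this hunk); the transactions are made up, and `sc` is an existing SparkContext:

    import org.apache.spark.mllib.fpm.FPGrowth

    val transactions = sc.parallelize(Seq(
      Array("a", "b", "c"),
      Array("a", "b"),
      Array("b", "c")))

    val model = new FPGrowth()
      .setMinSupport(0.5)    // default: 0.3
      .setNumPartitions(2)   // default (-1): derived from the input data
      .run(transactions)

    // freqItemsets is an RDD of (itemset, frequency) pairs
    model.freqItemsets.collect().foreach { case (items, count) =>
      println(items.mkString("[", ",", "]") + s" -> $count")
    }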

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index ad7e868..84f8ac2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -115,7 +115,7 @@ sealed trait Matrix extends Serializable {
  *
  * @param numRows number of rows
  * @param numCols number of columns
- * @param values matrix entries in column major
+ * @param values matrix entries in column major if not transposed or in row major otherwise
  * @param isTransposed whether the matrix is transposed. If true, `values` stores the matrix in
  *                     row major.
  */
@@ -187,7 +187,7 @@ class DenseMatrix(
     this
   }
 
-  override def transpose: Matrix = new DenseMatrix(numCols, numRows, values, !isTransposed)
+  override def transpose: DenseMatrix = new DenseMatrix(numCols, numRows, values, !isTransposed)
 
   private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
     if (!isTransposed) {
@@ -217,9 +217,11 @@ class DenseMatrix(
     }
   }
 
-  /** Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed
-    * set to false. */
-  def toSparse(): SparseMatrix = {
+  /**
+   * Generate a `SparseMatrix` from the given `DenseMatrix`. The new matrix will have isTransposed
+   * set to false.
+   */
+  def toSparse: SparseMatrix = {
     val spVals: MArrayBuilder[Double] = new MArrayBuilder.ofDouble
     val colPtrs: Array[Int] = new Array[Int](numCols + 1)
     val rowIndices: MArrayBuilder[Int] = new MArrayBuilder.ofInt
@@ -282,7 +284,7 @@ object DenseMatrix {
   }
 
   /**
-   * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers.
+   * Generate a `DenseMatrix` consisting of `i.i.d.` uniform random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param rng a random number generator
@@ -293,7 +295,7 @@ object DenseMatrix {
   }
 
   /**
-   * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers.
+   * Generate a `DenseMatrix` consisting of `i.i.d.` gaussian random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param rng a random number generator
@@ -336,10 +338,10 @@ object DenseMatrix {
  *
  * @param numRows number of rows
  * @param numCols number of columns
- * @param colPtrs the index corresponding to the start of a new column
- * @param rowIndices the row index of the entry. They must be in strictly increasing order for each
- *                   column
- * @param values non-zero matrix entries in column major
+ * @param colPtrs the index corresponding to the start of a new column (if not transposed)
+ * @param rowIndices the row index of the entry (if not transposed). They must be in strictly
+ *                   increasing order for each column
+ * @param values nonzero matrix entries in column major (if not transposed)
  * @param isTransposed whether the matrix is transposed. If true, the matrix can be considered
  *                     Compressed Sparse Row (CSR) format, where `colPtrs` behaves as rowPtrs,
  *                     and `rowIndices` behave as colIndices, and `values` are stored in row major.
@@ -434,7 +436,7 @@ class SparseMatrix(
     this
   }
 
-  override def transpose: Matrix =
+  override def transpose: SparseMatrix =
     new SparseMatrix(numCols, numRows, colPtrs, rowIndices, values, !isTransposed)
 
   private[spark] override def foreachActive(f: (Int, Int, Double) => Unit): Unit = {
@@ -464,9 +466,11 @@ class SparseMatrix(
     }
   }
 
-  /** Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed
-    * set to false. */
-  def toDense(): DenseMatrix = {
+  /**
+   * Generate a `DenseMatrix` from the given `SparseMatrix`. The new matrix will have isTransposed
+   * set to false.
+   */
+  def toDense: DenseMatrix = {
     new DenseMatrix(numRows, numCols, toArray)
   }
 }
@@ -593,7 +597,7 @@ object SparseMatrix {
   }
 
   /**
-   * Generate a `SparseMatrix` consisting of i.i.d. uniform random numbers. The number of non-zero
+   * Generate a `SparseMatrix` consisting of `i.i.d`. uniform random numbers. The number of non-zero
    * elements equal the ceiling of `numRows` x `numCols` x `density`
    *
    * @param numRows number of rows of the matrix
@@ -608,7 +612,7 @@ object SparseMatrix {
   }
 
   /**
-   * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
+   * Generate a `SparseMatrix` consisting of `i.i.d`. gaussian random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param density the desired density for the matrix
@@ -626,7 +630,7 @@ object SparseMatrix {
    * @return Square `SparseMatrix` with size `values.length` x `values.length` and non-zero
    *         `values` on the diagonal
    */
-  def diag(vector: Vector): SparseMatrix = {
+  def spdiag(vector: Vector): SparseMatrix = {
     val n = vector.size
     vector match {
       case sVec: SparseVector =>
@@ -722,7 +726,7 @@ object Matrices {
   def speye(n: Int): Matrix = SparseMatrix.speye(n)
 
   /**
-   * Generate a `DenseMatrix` consisting of i.i.d. uniform random numbers.
+   * Generate a `DenseMatrix` consisting of `i.i.d.` uniform random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param rng a random number generator
@@ -732,7 +736,7 @@ object Matrices {
     DenseMatrix.rand(numRows, numCols, rng)
 
   /**
-   * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
+   * Generate a `SparseMatrix` consisting of `i.i.d.` gaussian random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param density the desired density for the matrix
@@ -743,7 +747,7 @@ object Matrices {
     SparseMatrix.sprand(numRows, numCols, density, rng)
 
   /**
-   * Generate a `DenseMatrix` consisting of i.i.d. gaussian random numbers.
+   * Generate a `DenseMatrix` consisting of `i.i.d.` gaussian random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param rng a random number generator
@@ -753,7 +757,7 @@ object Matrices {
     DenseMatrix.randn(numRows, numCols, rng)
 
   /**
-   * Generate a `SparseMatrix` consisting of i.i.d. gaussian random numbers.
+   * Generate a `SparseMatrix` consisting of `i.i.d.` gaussian random numbers.
    * @param numRows number of rows of the matrix
    * @param numCols number of columns of the matrix
    * @param density the desired density for the matrix

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 8f75e6f..77785bd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -244,8 +244,7 @@ object Vectors {
   }
 
   /**
-   * Parses a string resulted from `Vector#toString` into
-   * an [[org.apache.spark.mllib.linalg.Vector]].
+   * Parses a string resulted from [[Vector.toString]] into a [[Vector]].
    */
   def parse(s: String): Vector = {
     parseNumeric(NumericParser.parse(s))
@@ -483,6 +482,7 @@ class DenseVector(val values: Array[Double]) extends Vector {
 }
 
 object DenseVector {
+  /** Extracts the value array from a dense vector. */
   def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values)
 }
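
The newly documented DenseVector.unapply enables the usual extractor pattern; a minimal sketch:

    import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}

    val v: Vector = Vectors.dense(1.0, 2.0, 3.0)
    v match {
      case DenseVector(values) => println(values.sum)  // values is the underlying Array[Double]
      case other               => println(s"not dense: $other")
    }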
 

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
index 3871152..1d25396 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
@@ -21,7 +21,8 @@ import scala.collection.mutable.ArrayBuffer
 
 import breeze.linalg.{DenseMatrix => BDM}
 
-import org.apache.spark.{SparkException, Logging, Partitioner}
+import org.apache.spark.{Logging, Partitioner, SparkException}
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
@@ -104,6 +105,8 @@ private[mllib] object GridPartitioner {
 }
 
 /**
+ * :: Experimental ::
+ *
  * Represents a distributed matrix in blocks of local matrices.
  *
  * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that
@@ -118,6 +121,7 @@ private[mllib] object GridPartitioner {
  * @param nCols Number of columns of this matrix. If the supplied value is less than or equal to
  *              zero, the number of columns will be calculated when `numCols` is invoked.
  */
+@Experimental
 class BlockMatrix(
     val blocks: RDD[((Int, Int), Matrix)],
     val rowsPerBlock: Int,
@@ -177,6 +181,10 @@ class BlockMatrix(
     assert(cols <= nCols, s"The number of columns $cols is more than claimed $nCols.")
   }
 
+  /**
+   * Validates the block matrix info against the matrix data (`blocks`) and throws an exception if
+   * any error is found.
+   */
   def validate(): Unit = {
     logDebug("Validating BlockMatrix...")
     // check if the matrix is larger than the claimed dimensions
@@ -351,7 +359,7 @@ class BlockMatrix(
           if (a.nonEmpty && b.nonEmpty) {
             val C = b.head match {
               case dense: DenseMatrix => a.head.multiply(dense)
-              case sparse: SparseMatrix => a.head.multiply(sparse.toDense())
+              case sparse: SparseMatrix => a.head.multiply(sparse.toDense)
               case _ => throw new SparkException(s"Unrecognized matrix type ${b.head.getClass}.")
             }
             Iterator(((blockRowIndex, blockColIndex), C.toBreeze))

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
index 955c593..8341bb8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomRDDs.scala
@@ -29,13 +29,13 @@ import org.apache.spark.util.Utils
 
 /**
  * :: Experimental ::
- * Generator methods for creating RDDs comprised of i.i.d. samples from some distribution.
+ * Generator methods for creating RDDs comprised of `i.i.d.` samples from some distribution.
  */
 @Experimental
 object RandomRDDs {
 
   /**
-   * Generates an RDD comprised of i.i.d. samples from the uniform distribution `U(0.0, 1.0)`.
+   * Generates an RDD comprised of `i.i.d.` samples from the uniform distribution `U(0.0, 1.0)`.
    *
    * To transform the distribution in the generated RDD from `U(0.0, 1.0)` to `U(a, b)`, use
    * `RandomRDDs.uniformRDD(sc, n, p, seed).map(v => a + (b - a) * v)`.
@@ -44,7 +44,7 @@ object RandomRDDs {
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples ~ `U(0.0, 1.0)`.
+   * @return RDD[Double] comprised of `i.i.d.` samples ~ `U(0.0, 1.0)`.
    */
   def uniformRDD(
       sc: SparkContext,
@@ -81,7 +81,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD comprised of i.i.d. samples from the standard normal distribution.
+   * Generates an RDD comprised of `i.i.d.` samples from the standard normal distribution.
    *
    * To transform the distribution in the generated RDD from standard normal to some other normal
    * `N(mean, sigma^2^)`, use `RandomRDDs.normalRDD(sc, n, p, seed).map(v => mean + sigma * v)`.
@@ -90,7 +90,7 @@ object RandomRDDs {
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples ~ N(0.0, 1.0).
+   * @return RDD[Double] comprised of `i.i.d.` samples ~ N(0.0, 1.0).
    */
   def normalRDD(
       sc: SparkContext,
@@ -127,14 +127,15 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD comprised of i.i.d. samples from the Poisson distribution with the input mean.
+   * Generates an RDD comprised of `i.i.d.` samples from the Poisson distribution with the input
+   * mean.
    *
    * @param sc SparkContext used to create the RDD.
    * @param mean Mean, or lambda, for the Poisson distribution.
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+   * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
    */
   def poissonRDD(
       sc: SparkContext,
@@ -177,7 +178,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD comprised of i.i.d. samples from the exponential distribution with
+   * Generates an RDD comprised of `i.i.d.` samples from the exponential distribution with
    * the input mean.
    *
    * @param sc SparkContext used to create the RDD.
@@ -185,7 +186,7 @@ object RandomRDDs {
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+   * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
    */
   def exponentialRDD(
       sc: SparkContext,
@@ -228,7 +229,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD comprised of i.i.d. samples from the gamma distribution with the input
+   * Generates an RDD comprised of `i.i.d.` samples from the gamma distribution with the input
    *  shape and scale.
    *
    * @param sc SparkContext used to create the RDD.
@@ -237,7 +238,7 @@ object RandomRDDs {
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+   * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
    */
   def gammaRDD(
       sc: SparkContext,
@@ -287,7 +288,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD comprised of i.i.d. samples from the log normal distribution with the input
+   * Generates an RDD comprised of `i.i.d.` samples from the log normal distribution with the input
    *  mean and standard deviation
    *
    * @param sc SparkContext used to create the RDD.
@@ -296,7 +297,7 @@ object RandomRDDs {
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples ~ Pois(mean).
+   * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
    */
   def logNormalRDD(
       sc: SparkContext,
@@ -348,14 +349,14 @@ object RandomRDDs {
 
   /**
    * :: DeveloperApi ::
-   * Generates an RDD comprised of i.i.d. samples produced by the input RandomDataGenerator.
+   * Generates an RDD comprised of `i.i.d.` samples produced by the input RandomDataGenerator.
    *
    * @param sc SparkContext used to create the RDD.
    * @param generator RandomDataGenerator used to populate the RDD.
    * @param size Size of the RDD.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Double] comprised of i.i.d. samples produced by generator.
+   * @return RDD[Double] comprised of `i.i.d.` samples produced by generator.
    */
   @DeveloperApi
   def randomRDD[T: ClassTag](
@@ -370,7 +371,7 @@ object RandomRDDs {
   // TODO Generate RDD[Vector] from multivariate distributions.
 
   /**
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
    * uniform distribution on `U(0.0, 1.0)`.
    *
    * @param sc SparkContext used to create the RDD.
@@ -424,7 +425,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
    * standard normal distribution.
    *
    * @param sc SparkContext used to create the RDD.
@@ -432,7 +433,7 @@ object RandomRDDs {
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Vector] with vectors containing i.i.d. samples ~ `N(0.0, 1.0)`.
+   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ `N(0.0, 1.0)`.
    */
   def normalVectorRDD(
       sc: SparkContext,
@@ -478,7 +479,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from a
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from a
    * log normal distribution.
    *
    * @param sc SparkContext used to create the RDD.
@@ -488,7 +489,7 @@ object RandomRDDs {
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Vector] with vectors containing i.i.d. samples.
+   * @return RDD[Vector] with vectors containing `i.i.d.` samples.
    */
   def logNormalVectorRDD(
       sc: SparkContext,
@@ -544,7 +545,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
    * Poisson distribution with the input mean.
    *
    * @param sc SparkContext used to create the RDD.
@@ -553,7 +554,7 @@ object RandomRDDs {
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Vector] with vectors containing i.i.d. samples ~ Pois(mean).
+   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Pois(mean).
    */
   def poissonVectorRDD(
       sc: SparkContext,
@@ -603,7 +604,7 @@ object RandomRDDs {
   }
 
   /**
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
    * exponential distribution with the input mean.
    *
    * @param sc SparkContext used to create the RDD.
@@ -612,7 +613,7 @@ object RandomRDDs {
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Vector] with vectors containing i.i.d. samples ~ Exp(mean).
+   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean).
    */
   def exponentialVectorRDD(
       sc: SparkContext,
@@ -665,7 +666,7 @@ object RandomRDDs {
 
 
   /**
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples drawn from the
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
    * gamma distribution with the input shape and scale.
    *
    * @param sc SparkContext used to create the RDD.
@@ -675,7 +676,7 @@ object RandomRDDs {
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`)
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Vector] with vectors containing i.i.d. samples ~ Exp(mean).
+   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean).
    */
   def gammaVectorRDD(
       sc: SparkContext,
@@ -731,7 +732,7 @@ object RandomRDDs {
 
   /**
    * :: DeveloperApi ::
-   * Generates an RDD[Vector] with vectors containing i.i.d. samples produced by the
+   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples produced by the
    * input RandomDataGenerator.
    *
    * @param sc SparkContext used to create the RDD.
@@ -740,7 +741,7 @@ object RandomRDDs {
    * @param numCols Number of elements in each Vector.
    * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
    * @param seed Random seed (default: a random long integer).
-   * @return RDD[Vector] with vectors containing i.i.d. samples produced by generator.
+   * @return RDD[Vector] with vectors containing `i.i.d.` samples produced by generator.
    */
   @DeveloperApi
   def randomVectorRDD(sc: SparkContext,
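
A short sketch tying together a few of the generators documented above; `sc` is an existing SparkContext and the sizes are arbitrary:

    import org.apache.spark.mllib.random.RandomRDDs

    // 10000 i.i.d. samples ~ U(0.0, 1.0), shifted to U(-1.0, 1.0) as the docs suggest
    val u = RandomRDDs.uniformRDD(sc, 10000L, numPartitions = 4).map(v => -1.0 + 2.0 * v)

    // 1000 vectors of 5 i.i.d. samples each ~ N(0.0, 1.0)
    val nv = RandomRDDs.normalVectorRDD(sc, numRows = 1000L, numCols = 5)

    // 10000 i.i.d. samples ~ Pois(3.0)
    val p = RandomRDDs.poissonRDD(sc, mean = 3.0, size = 10000L)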

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index f4f51f2..4bb28d1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -25,10 +25,8 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
 /**
- * :: Experimental ::
  * A more compact class to represent a rating than Tuple3[Int, Int, Double].
  */
-@Experimental
 case class Rating(user: Int, product: Int, rating: Double)
 
 /**
@@ -135,10 +133,8 @@ class ALS private (
   }
 
   /**
-   * :: Experimental ::
    * Sets the constant used in computing confidence in implicit ALS. Default: 1.0.
    */
-  @Experimental
   def setAlpha(alpha: Double): this.type = {
     this.alpha = alpha
     this

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 5ed6477..cb70852 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -23,10 +23,13 @@ import java.util.Arrays.binarySearch
 
 import scala.collection.mutable.ArrayBuffer
 
+import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD}
 import org.apache.spark.rdd.RDD
 
 /**
+ * :: Experimental ::
+ *
  * Regression model for isotonic regression.
  *
  * @param boundaries Array of boundaries for which predictions are known.
@@ -35,6 +38,7 @@ import org.apache.spark.rdd.RDD
  *                    Results of isotonic regression and therefore monotone.
  * @param isotonic indicates whether this is isotonic or antitonic.
  */
+@Experimental
 class IsotonicRegressionModel (
     val boundaries: Array[Double],
     val predictions: Array[Double],
@@ -123,6 +127,8 @@ class IsotonicRegressionModel (
 }
 
 /**
+ * :: Experimental ::
+ *
  * Isotonic regression.
  * Currently implemented using parallelized pool adjacent violators algorithm.
  * Only univariate (single feature) algorithm supported.
@@ -130,14 +136,17 @@ class IsotonicRegressionModel (
  * Sequential PAV implementation based on:
  * Tibshirani, Ryan J., Holger Hoefling, and Robert Tibshirani.
  *   "Nearly-isotonic regression." Technometrics 53.1 (2011): 54-61.
- *   Available from http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf
+ *   Available from [[http://www.stat.cmu.edu/~ryantibs/papers/neariso.pdf]]
  *
  * Sequential PAV parallelization based on:
  * Kearsley, Anthony J., Richard A. Tapia, and Michael W. Trosset.
  *   "An approach to parallelizing isotonic regression."
  *   Applied Mathematics and Parallel Computing. Physica-Verlag HD, 1996. 141-147.
- *   Available from http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf
+ *   Available from [[http://softlib.rice.edu/pub/CRPC-TRs/reports/CRPC-TR96640.pdf]]
+ *
+ * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]]
  */
+@Experimental
 class IsotonicRegression private (private var isotonic: Boolean) extends Serializable {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
index 44a8dbb..c854f12 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearAlgorithm.scala
@@ -39,14 +39,14 @@ import org.apache.spark.streaming.dstream.DStream
  *
  * For example usage, see `StreamingLinearRegressionWithSGD`.
  *
- * NOTE(Freeman): In some use cases, the order in which trainOn and predictOn
+ * NOTE: In some use cases, the order in which trainOn and predictOn
  * are called in an application will affect the results. When called on
  * the same DStream, if trainOn is called before predictOn, when new data
  * arrive the model will update and the prediction will be based on the new
  * model. Whereas if predictOn is called first, the prediction will use the model
  * from the previous update.
  *
- * NOTE(Freeman): It is ok to call predictOn repeatedly on multiple streams; this
+ * NOTE: It is ok to call predictOn repeatedly on multiple streams; this
  * will generate predictions for each one all using the current model.
  * It is also ok to call trainOn on different streams; this will update
  * the model using each of the different sources, in sequence.

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 3cf4e80..b3fad0c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -26,36 +26,32 @@ import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult}
 import org.apache.spark.rdd.RDD
 
 /**
+ * :: Experimental ::
  * API for statistical functions in MLlib.
  */
 @Experimental
 object Statistics {
 
   /**
-   * :: Experimental ::
    * Computes column-wise summary statistics for the input RDD[Vector].
    *
    * @param X an RDD[Vector] for which column-wise summary statistics are to be computed.
    * @return [[MultivariateStatisticalSummary]] object containing column-wise summary statistics.
    */
-  @Experimental
   def colStats(X: RDD[Vector]): MultivariateStatisticalSummary = {
     new RowMatrix(X).computeColumnSummaryStatistics()
   }
 
   /**
-   * :: Experimental ::
    * Compute the Pearson correlation matrix for the input RDD of Vectors.
    * Columns with 0 covariance produce NaN entries in the correlation matrix.
    *
    * @param X an RDD[Vector] for which the correlation matrix is to be computed.
    * @return Pearson correlation matrix comparing columns in X.
    */
-  @Experimental
   def corr(X: RDD[Vector]): Matrix = Correlations.corrMatrix(X)
 
   /**
-   * :: Experimental ::
    * Compute the correlation matrix for the input RDD of Vectors using the specified method.
    * Methods currently supported: `pearson` (default), `spearman`.
    *
@@ -69,11 +65,9 @@ object Statistics {
    *               Supported: `pearson` (default), `spearman`
    * @return Correlation matrix comparing columns in X.
    */
-  @Experimental
   def corr(X: RDD[Vector], method: String): Matrix = Correlations.corrMatrix(X, method)
 
   /**
-   * :: Experimental ::
    * Compute the Pearson correlation for the input RDDs.
    * Returns NaN if either vector has 0 variance.
    *
@@ -84,11 +78,9 @@ object Statistics {
    * @param y RDD[Double] of the same cardinality as x.
    * @return A Double containing the Pearson correlation between the two input RDD[Double]s
    */
-  @Experimental
   def corr(x: RDD[Double], y: RDD[Double]): Double = Correlations.corr(x, y)
 
   /**
-   * :: Experimental ::
    * Compute the correlation for the input RDDs using the specified method.
    * Methods currently supported: `pearson` (default), `spearman`.
    *
@@ -99,14 +91,12 @@ object Statistics {
    * @param y RDD[Double] of the same cardinality as x.
    * @param method String specifying the method to use for computing correlation.
    *               Supported: `pearson` (default), `spearman`
-   *@return A Double containing the correlation between the two input RDD[Double]s using the
+   * @return A Double containing the correlation between the two input RDD[Double]s using the
    *         specified method.
    */
-  @Experimental
   def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
 
   /**
-   * :: Experimental ::
    * Conduct Pearson's chi-squared goodness of fit test of the observed data against the
    * expected distribution.
    *
@@ -120,13 +110,11 @@ object Statistics {
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
    */
-  @Experimental
   def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
     ChiSqTest.chiSquared(observed, expected)
   }
 
   /**
-   * :: Experimental ::
    * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
    * distribution, with each category having an expected frequency of `1 / observed.size`.
    *
@@ -136,11 +124,9 @@ object Statistics {
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
    */
-  @Experimental
   def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
 
   /**
-   * :: Experimental ::
    * Conduct Pearson's independence test on the input contingency matrix, which cannot contain
    * negative entries or columns or rows that sum up to 0.
    *
@@ -148,11 +134,9 @@ object Statistics {
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
    */
-  @Experimental
   def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
 
   /**
-   * :: Experimental ::
    * Conduct Pearson's independence test for every feature against the label across the input RDD.
    * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
    * the chi-squared statistic is computed. All label and feature values must be categorical.
@@ -162,7 +146,6 @@ object Statistics {
    * @return an array containing the ChiSquaredTestResult for every feature against the label.
    *         The order of the elements in the returned array reflects the order of input features.
    */
-  @Experimental
   def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
     ChiSqTest.chiSquaredFeatures(data)
   }
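
A minimal sketch exercising a few of the methods whose signatures appear above; `sc` is an existing SparkContext:

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.stat.Statistics

    val x = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
    val y = sc.parallelize(Seq(2.0, 4.0, 5.0, 9.0))

    val pearson  = Statistics.corr(x, y)              // Pearson by default
    val spearman = Statistics.corr(x, y, "spearman")

    // goodness-of-fit test of observed counts against a uniform expected distribution
    val gof = Statistics.chiSqTest(Vectors.dense(10.0, 12.0, 9.0, 11.0))
    println(s"p-value: ${gof.pValue}")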

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
----------------------------------------------------------------------
diff --git a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
index 704d484..3349c50 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/linalg/JavaMatricesSuite.java
@@ -71,8 +71,8 @@ public class JavaMatricesSuite implements Serializable {
         Matrix sm = Matrices.diag(sv);
         DenseMatrix d = DenseMatrix.diag(v);
         DenseMatrix sd = DenseMatrix.diag(sv);
-        SparseMatrix s = SparseMatrix.diag(v);
-        SparseMatrix ss = SparseMatrix.diag(sv);
+        SparseMatrix s = SparseMatrix.spdiag(v);
+        SparseMatrix ss = SparseMatrix.spdiag(sv);
 
         assertArrayEquals(m.toArray(), sm.toArray(), 0.0);
         assertArrayEquals(d.toArray(), sm.toArray(), 0.0);

http://git-wip-us.apache.org/repos/asf/spark/blob/db346904/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
index b1ebfde..c098b54 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
@@ -137,8 +137,8 @@ class MatricesSuite extends FunSuite {
     val spMat1 = new SparseMatrix(m, n, colPtrs, rowIndices, values)
     val deMat1 = new DenseMatrix(m, n, allValues)
 
-    val spMat2 = deMat1.toSparse()
-    val deMat2 = spMat1.toDense()
+    val spMat2 = deMat1.toSparse
+    val deMat2 = spMat1.toDense
 
     assert(spMat1.toBreeze === spMat2.toBreeze)
     assert(deMat1.toBreeze === deMat2.toBreeze)
@@ -185,8 +185,8 @@ class MatricesSuite extends FunSuite {
     assert(!dA.toArray.eq(dAT.toArray), "has to have a new array")
     assert(dA.values.eq(dAT.transpose.asInstanceOf[DenseMatrix].values), "should not copy array")
 
-    assert(dAT.toSparse().toBreeze === sATexpected.toBreeze)
-    assert(sAT.toDense().toBreeze === dATexpected.toBreeze)
+    assert(dAT.toSparse.toBreeze === sATexpected.toBreeze)
+    assert(sAT.toDense.toBreeze === dATexpected.toBreeze)
   }
 
   test("foreachActive") {

