You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/05/07 10:12:18 UTC
spark git commit: [SPARK-7421] [MLLIB] OnlineLDA cleanups

Repository: spark
Updated Branches:
  refs/heads/master fae4e2d60 -> 8b6b46e4f


[SPARK-7421] [MLLIB] OnlineLDA cleanups

Small changes, primarily to allow us more flexibility in the future:
* Rename "tau_0" to "tau0"
* Mark LDAOptimizer trait sealed and DeveloperApi.
* Mark LDAOptimizer subclasses as final.
* Mark setOptimizer (the one taking an LDAOptimizer) and getOptimizer as DeveloperApi since we may need to change them in the future

CC: hhbyyh

Author: Joseph K. Bradley <jo...@databricks.com>

Closes #5956 from jkbradley/onlinelda-cleanups and squashes the following commits:

f4be508 [Joseph K. Bradley] added newline
f4003e4 [Joseph K. Bradley] Changes: * Rename "tau_0" to "tau0" * Mark LDAOptimizer trait sealed and DeveloperApi. * Mark LDAOptimizer subclasses as final. * Mark setOptimizer (the one taking an LDAOptimizer) and getOptimizer as DeveloperApi since we may need to change them in the future


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b6b46e4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b6b46e4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b6b46e4

Branch: refs/heads/master
Commit: 8b6b46e4ff5f19fb7befecaaa0eda63bf29a0e2c
Parents: fae4e2d
Author: Joseph K. Bradley <jo...@databricks.com>
Authored: Thu May 7 01:12:14 2015 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Thu May 7 01:12:14 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/mllib/clustering/LDA.scala | 15 ++++++--
 .../spark/mllib/clustering/LDAOptimizer.scala   | 37 +++++++++-----------
 .../spark/mllib/clustering/JavaLDASuite.java    |  2 +-
 .../spark/mllib/clustering/LDASuite.scala       |  8 ++---
 4 files changed, 34 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8b6b46e4/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index c8daa23..a410547 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -18,8 +18,9 @@
 package org.apache.spark.mllib.clustering
 
 import breeze.linalg.{DenseVector => BDV}
+
 import org.apache.spark.Logging
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaPairRDD
 import org.apache.spark.graphx._
 import org.apache.spark.mllib.linalg.Vector
@@ -197,12 +198,20 @@ class LDA private (
   }
 
 
-  /** LDAOptimizer used to perform the actual calculation */
+  /**
+   * :: DeveloperApi ::
+   *
+   * LDAOptimizer used to perform the actual calculation
+   */
+  @DeveloperApi
   def getOptimizer: LDAOptimizer = ldaOptimizer
 
   /**
+   * :: DeveloperApi ::
+   *
    * LDAOptimizer used to perform the actual calculation (default = EMLDAOptimizer)
    */
+  @DeveloperApi
   def setOptimizer(optimizer: LDAOptimizer): this.type = {
     this.ldaOptimizer = optimizer
     this
@@ -210,7 +219,7 @@ class LDA private (
 
   /**
    * Set the LDAOptimizer used to perform the actual calculation by algorithm name.
-   * Currently "em", "online" is supported.
+   * Currently "em", "online" are supported.
    */
   def setOptimizer(optimizerName: String): this.type = {
     this.ldaOptimizer =

http://git-wip-us.apache.org/repos/asf/spark/blob/8b6b46e4/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 093aa0f..6fa2fe0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -23,7 +23,7 @@ import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, sum, normalize, kr
 import breeze.numerics.{digamma, exp, abs}
 import breeze.stats.distributions.{Gamma, RandBasis}
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.impl.GraphImpl
 import org.apache.spark.mllib.impl.PeriodicGraphCheckpointer
@@ -31,13 +31,13 @@ import org.apache.spark.mllib.linalg.{Matrices, SparseVector, DenseVector, Vecto
 import org.apache.spark.rdd.RDD
 
 /**
- * :: Experimental ::
+ * :: DeveloperApi ::
  *
  * An LDAOptimizer specifies which optimization/learning/inference algorithm to use, and it can
  * hold optimizer-specific parameters for users to set.
  */
-@Experimental
-trait LDAOptimizer {
+@DeveloperApi
+sealed trait LDAOptimizer {
 
   /*
     DEVELOPERS NOTE:
@@ -59,7 +59,7 @@ trait LDAOptimizer {
 }
 
 /**
- * :: Experimental ::
+ * :: DeveloperApi ::
  *
  * Optimizer for EM algorithm which stores data + parameter graph, plus algorithm parameters.
  *
@@ -75,8 +75,8 @@ trait LDAOptimizer {
  *    "On Smoothing and Inference for Topic Models."  UAI, 2009.
  *
  */
-@Experimental
-class EMLDAOptimizer extends LDAOptimizer {
+@DeveloperApi
+final class EMLDAOptimizer extends LDAOptimizer {
 
   import LDA._
 
@@ -211,7 +211,7 @@ class EMLDAOptimizer extends LDAOptimizer {
 
 
 /**
- * :: Experimental ::
+ * :: DeveloperApi ::
  *
  * An online optimizer for LDA. The Optimizer implements the Online variational Bayes LDA
  * algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic
@@ -220,8 +220,8 @@ class EMLDAOptimizer extends LDAOptimizer {
  * Original Online LDA paper:
  *   Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010.
  */
-@Experimental
-class OnlineLDAOptimizer extends LDAOptimizer {
+@DeveloperApi
+final class OnlineLDAOptimizer extends LDAOptimizer {
 
   // LDA common parameters
   private var k: Int = 0
@@ -243,8 +243,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
   private var randomGenerator: java.util.Random = null
 
   // Online LDA specific parameters
-  // Learning rate is: (tau_0 + t)^{-kappa}
-  private var tau_0: Double = 1024
+  // Learning rate is: (tau0 + t)^{-kappa}
+  private var tau0: Double = 1024
   private var kappa: Double = 0.51
   private var miniBatchFraction: Double = 0.05
 
@@ -265,16 +265,16 @@ class OnlineLDAOptimizer extends LDAOptimizer {
    * A (positive) learning parameter that downweights early iterations. Larger values make early
    * iterations count less.
    */
-  def getTau_0: Double = this.tau_0
+  def getTau0: Double = this.tau0
 
   /**
    * A (positive) learning parameter that downweights early iterations. Larger values make early
    * iterations count less.
    * Default: 1024, following the original Online LDA paper.
    */
-  def setTau_0(tau_0: Double): this.type = {
-    require(tau_0 > 0,  s"LDA tau_0 must be positive, but was set to $tau_0")
-    this.tau_0 = tau_0
+  def setTau0(tau0: Double): this.type = {
+    require(tau0 > 0,  s"LDA tau0 must be positive, but was set to $tau0")
+    this.tau0 = tau0
     this
   }
 
@@ -434,11 +434,8 @@ class OnlineLDAOptimizer extends LDAOptimizer {
    * Update lambda based on the batch submitted. batchSize can be different for each iteration.
    */
   private[clustering] def update(stat: BDM[Double], iter: Int, batchSize: Int): Unit = {
-    val tau_0 = this.getTau_0
-    val kappa = this.getKappa
-
     // weight of the mini-batch.
-    val weight = math.pow(tau_0 + iter, -kappa)
+    val weight = math.pow(getTau0 + iter, -getKappa)
 
     // Update lambda based on documents.
     lambda = lambda * (1 - weight) +

http://git-wip-us.apache.org/repos/asf/spark/blob/8b6b46e4/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
----------------------------------------------------------------------
diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index f394d90..96c2da1 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -117,7 +117,7 @@ public class JavaLDASuite implements Serializable {
 
     // Train a model
     OnlineLDAOptimizer op = new OnlineLDAOptimizer()
-      .setTau_0(1024)
+      .setTau0(1024)
       .setKappa(0.51)
       .setGammaShape(1e40)
       .setMiniBatchFraction(0.5);

http://git-wip-us.apache.org/repos/asf/spark/blob/8b6b46e4/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 2dcc881..d5b7d96 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -138,12 +138,12 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
     val lda = new LDA().setK(2)
     val corpus = sc.parallelize(tinyCorpus, 2)
     val op = new OnlineLDAOptimizer().initialize(corpus, lda)
-    op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau_0(567)
+    op.setKappa(0.9876).setMiniBatchFraction(0.123).setTau0(567)
     assert(op.getAlpha == 0.5) // default 1.0 / k
     assert(op.getEta == 0.5)   // default 1.0 / k
     assert(op.getKappa == 0.9876)
     assert(op.getMiniBatchFraction == 0.123)
-    assert(op.getTau_0 == 567)
+    assert(op.getTau0 == 567)
   }
 
   test("OnlineLDAOptimizer one iteration") {
@@ -159,7 +159,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
     val corpus = sc.parallelize(docs, 2)
 
     // Set GammaShape large to avoid the stochastic impact.
-    val op = new OnlineLDAOptimizer().setTau_0(1024).setKappa(0.51).setGammaShape(1e40)
+    val op = new OnlineLDAOptimizer().setTau0(1024).setKappa(0.51).setGammaShape(1e40)
       .setMiniBatchFraction(1)
     val lda = new LDA().setK(k).setMaxIterations(1).setOptimizer(op).setSeed(12345)
 
@@ -192,7 +192,7 @@ class LDASuite extends FunSuite with MLlibTestSparkContext {
     ).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) }
 
     val docs = sc.parallelize(toydata)
-    val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau_0(1024).setKappa(0.51)
+    val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
       .setGammaShape(1e10)
     val lda = new LDA().setK(2)
       .setDocConcentration(0.01)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org