You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2012/06/03 20:39:44 UTC

svn commit: r1345736 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/lda/ core/src/test/java/org/apache/mahout/clustering/ core/src/test/java/org/apache/mahout/clustering/lda/ src/conf/

Author: ssc
Date: Sun Jun  3 18:39:43 2012
New Revision: 1345736

URL: http://svn.apache.org/viewvc?rev=1345736&view=rev
Log:
MAHOUT-986 Remove old LDA implementation from codebase

Removed:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADocumentTopicMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAInference.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDASampler.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAState.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAUtil.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAWordTopicMapper.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/TestLDAInference.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/TestMapReduce.java
    mahout/trunk/src/conf/lda.props
    mahout/trunk/src/conf/ldatopics.props
Modified:
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
    mahout/trunk/src/conf/driver.classes.props

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java?rev=1345736&r1=1345735&r2=1345736&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java Sun Jun  3 18:39:43 2012
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.clustering;
 
+import com.google.common.base.Preconditions;
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -24,7 +25,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.clustering.lda.LDASampler;
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Matrix;
@@ -32,6 +32,7 @@ import org.apache.mahout.math.SparseRowM
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.stats.Sampler;
 
 import java.io.IOException;
 import java.util.Random;
@@ -109,4 +110,43 @@ public final class ClusteringTestUtils {
     }
     return model;
   }
+
+  /**
+   * Takes in a {@link Matrix} of topic distributions (such as generated by {@link org.apache.mahout.clustering.lda.cvb.CVB0Driver} or
+   * {@link org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0}), and constructs
+   * one {@link Sampler} per row of that matrix. The model may then be sampled from by providing a
+   * distribution over topics (one entry per matrix row) and the number of samples desired.
+   */
+  static class LDASampler {
+      private final Random random;
+      private final Sampler[] samplers;
+
+      public LDASampler(Matrix model, Random random) {
+          this.random = random;
+          samplers = new Sampler[model.numRows()];
+          for (int i = 0; i < samplers.length; i++) { // one sampler per model row, all sharing the same Random
+              samplers[i] = new Sampler(random, model.viewRow(i));
+          }
+      }
+
+      /**
+       * Draws each sample by first picking a row index from {@code topicDistribution}, then sampling from that row's sampler.
+       * @param topicDistribution vector of p(topicId) for all {@code topicId < model.numRows()}; must be non-null and sized to the model's row count
+       * @param numSamples the number of times to sample (with replacement) from the model; must be positive
+       * @return array of length numSamples, with each entry being a sample from the model.  There
+       * may be repeats
+       */
+      public int[] sample(Vector topicDistribution, int numSamples) {
+          Preconditions.checkNotNull(topicDistribution);
+          Preconditions.checkArgument(numSamples > 0, "numSamples must be positive");
+          Preconditions.checkArgument(topicDistribution.size() == samplers.length,
+                  "topicDistribution must have same cardinality as the sampling model");
+          int[] samples = new int[numSamples];
+          Sampler topicSampler = new Sampler(random, topicDistribution);
+          for (int i = 0; i < numSamples; i++) {
+              samples[i] = samplers[topicSampler.sample()].sample();
+          }
+          return samples;
+      }
+  }
 }

Modified: mahout/trunk/src/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.props?rev=1345736&r1=1345735&r2=1345736&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.props (original)
+++ mahout/trunk/src/conf/driver.classes.props Sun Jun  3 18:39:43 2012
@@ -26,8 +26,6 @@ org.apache.mahout.math.hadoop.stochastic
 org.apache.mahout.clustering.kmeans.KMeansDriver = kmeans : K-means clustering
 org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver = fkmeans : Fuzzy K-means clustering
 org.apache.mahout.clustering.minhash.MinHashDriver = minhash : Run Minhash clustering
-org.apache.mahout.clustering.lda.LDADriver = lda : Latent Dirchlet Allocation
-org.apache.mahout.clustering.lda.LDAPrintTopics = ldatopics : LDA Print Topics
 org.apache.mahout.clustering.lda.cvb.CVB0Driver = cvb : LDA via Collapsed Variation Bayes (0th deriv. approx)
 org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0 = cvb0_local : LDA via Collapsed Variation Bayes, in memory locally.
 org.apache.mahout.clustering.dirichlet.DirichletDriver = dirichlet : Dirichlet Clustering