You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2012/06/03 20:39:44 UTC
svn commit: r1345736 - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/lda/
core/src/test/java/org/apache/mahout/clustering/
core/src/test/java/org/apache/mahout/clustering/lda/ src/conf/
Author: ssc
Date: Sun Jun 3 18:39:43 2012
New Revision: 1345736
URL: http://svn.apache.org/viewvc?rev=1345736&view=rev
Log:
MAHOUT-986 Remove old LDA implementation from codebase
Removed:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADocumentTopicMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAInference.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDASampler.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAState.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAUtil.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDAWordTopicMapper.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/TestLDAInference.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/lda/TestMapReduce.java
mahout/trunk/src/conf/lda.props
mahout/trunk/src/conf/ldatopics.props
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
mahout/trunk/src/conf/driver.classes.props
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java?rev=1345736&r1=1345735&r2=1345736&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/ClusteringTestUtils.java Sun Jun 3 18:39:43 2012
@@ -17,6 +17,7 @@
package org.apache.mahout.clustering;
+import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -24,7 +25,6 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.mahout.clustering.lda.LDASampler;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
@@ -32,6 +32,7 @@ import org.apache.mahout.math.SparseRowM
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.stats.Sampler;
import java.io.IOException;
import java.util.Random;
@@ -109,4 +110,43 @@ public final class ClusteringTestUtils {
}
return model;
}
+
+ /**
+ * Takes in a {@link Matrix} of topic distributions (such as generated by {@link org.apache.mahout.clustering.lda.cvb.CVB0Driver} or
+ * {@link org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0}), and constructs
+ * a set of samplers over this distribution, which may be sampled from by providing a distribution
+ * over topics, and a number of samples desired.
+ */
+ static class LDASampler {
+ private final Random random; // passed to every Sampler this class creates (per-row and per-call)
+ private final Sampler[] samplers; // one Sampler per row of the model matrix, indexed by row
+
+ public LDASampler(Matrix model, Random random) {
+ this.random = random;
+ samplers = new Sampler[model.numRows()];
+ for (int i = 0; i < samplers.length; i++) {
+ samplers[i] = new Sampler(random, model.viewRow(i)); // sampler over row i of the model
+ }
+ }
+
+ /**
+ * Draws each sample in two stages: an index is sampled from topicDistribution, then the sampler for that model row is sampled.
+ * @param topicDistribution vector of p(topicId) for all topicId < model.numRows(); must be non-null with size equal to model.numRows()
+ * @param numSamples the number of times to sample (with replacement) from the model; must be positive
+ * @return array of length numSamples, with each entry being a sample from the model. There
+ * may be repeats
+ */
+ public int[] sample(Vector topicDistribution, int numSamples) {
+ Preconditions.checkNotNull(topicDistribution);
+ Preconditions.checkArgument(numSamples > 0, "numSamples must be positive");
+ Preconditions.checkArgument(topicDistribution.size() == samplers.length,
+ "topicDistribution must have same cardinality as the sampling model");
+ int[] samples = new int[numSamples];
+ Sampler topicSampler = new Sampler(random, topicDistribution); // rebuilt on every call from the caller-supplied distribution
+ for (int i = 0; i < numSamples; i++) {
+ samples[i] = samplers[topicSampler.sample()].sample(); // pick a row index first, then sample from that row's sampler
+ }
+ return samples;
+ }
+ }
}
Modified: mahout/trunk/src/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.props?rev=1345736&r1=1345735&r2=1345736&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.props (original)
+++ mahout/trunk/src/conf/driver.classes.props Sun Jun 3 18:39:43 2012
@@ -26,8 +26,6 @@ org.apache.mahout.math.hadoop.stochastic
org.apache.mahout.clustering.kmeans.KMeansDriver = kmeans : K-means clustering
org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver = fkmeans : Fuzzy K-means clustering
org.apache.mahout.clustering.minhash.MinHashDriver = minhash : Run Minhash clustering
-org.apache.mahout.clustering.lda.LDADriver = lda : Latent Dirchlet Allocation
-org.apache.mahout.clustering.lda.LDAPrintTopics = ldatopics : LDA Print Topics
org.apache.mahout.clustering.lda.cvb.CVB0Driver = cvb : LDA via Collapsed Variation Bayes (0th deriv. approx)
org.apache.mahout.clustering.lda.cvb.InMemoryCollapsedVariationalBayes0 = cvb0_local : LDA via Collapsed Variation Bayes, in memory locally.
org.apache.mahout.clustering.dirichlet.DirichletDriver = dirichlet : Dirichlet Clustering