You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/03/01 17:37:26 UTC
svn commit: r917577 - in /lucene/mahout/trunk: bin/mahout
utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
Author: robinanil
Date: Mon Mar 1 16:37:26 2010
New Revision: 917577
URL: http://svn.apache.org/viewvc?rev=917577&view=rev
Log:
Adding LDA print topics to the shell script and some cleanup of style
Modified:
lucene/mahout/trunk/bin/mahout
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
Modified: lucene/mahout/trunk/bin/mahout
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/bin/mahout?rev=917577&r1=917576&r2=917577&view=diff
==============================================================================
--- lucene/mahout/trunk/bin/mahout (original)
+++ lucene/mahout/trunk/bin/mahout Mon Mar 1 16:37:26 2010
@@ -60,6 +60,7 @@
echo " fpg run FPGrowth Driver for freq. pattern mining"
echo " kmeans run kmeans clustering"
echo " lda run LDA clustering"
+ echo " ldadump dump the output state of LDA"
echo " lucenevector generate vectors from a lucene index"
echo " trainclassifier run Bayes/CBayes classifier training job"
echo " testclassifier test Bayes/CBayes model using a pre-classified data"
@@ -186,6 +187,8 @@
CLASS=org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver
elif [ "$COMMAND" = "lda" ] ; then
CLASS=org.apache.mahout.clustering.lda.LDADriver
+elif [ "$COMMAND" = "ldadump" ] ; then
+ CLASS=org.apache.mahout.clustering.lda.LDAPrintTopics
elif [ "$COMMAND" = "fpg" ] ; then
CLASS=org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver
elif [ "$COMMAND" = "dirichlet" ] ; then
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=917577&r1=917576&r2=917577&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Mon Mar 1 16:37:26 2010
@@ -50,7 +50,7 @@
*/
public class LDAPrintTopics {
- private LDAPrintTopics() { }
+ private LDAPrintTopics() {}
private static class StringDoublePair implements Comparable<StringDoublePair> {
private final double score;
@@ -83,9 +83,9 @@
}
public static List<List<String>> topWordsForTopics(String dir,
- Configuration job,
- List<String> wordList,
- int numWordsToPrint) throws IOException {
+ Configuration job,
+ List<String> wordList,
+ int numWordsToPrint) throws IOException {
FileSystem fs = new Path(dir).getFileSystem(job);
List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
@@ -121,18 +121,14 @@
}
// Expands the queue list to have a Queue for topic K
- private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues,
- int k) {
+ private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues, int k) {
for (int i = queues.size(); i <= k; ++i) {
queues.add(new PriorityQueue<StringDoublePair>());
}
}
// Adds the word if the queue is below capacity, or the score is high enough
- private static void maybeEnqueue(Queue<StringDoublePair> q,
- String word,
- double score,
- int numWordsToPrint) {
+ private static void maybeEnqueue(Queue<StringDoublePair> q, String word, double score, int numWordsToPrint) {
if (q.size() >= numWordsToPrint && score > q.peek().score) {
q.poll();
}
@@ -146,43 +142,30 @@
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
- Option inputOpt = obuilder.withLongName("input").withRequired(true)
- .withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create())
- .withDescription("Path to an LDA output (a state)").withShortName("i")
- .create();
-
- Option dictOpt = obuilder.withLongName("dict").withRequired(true)
- .withArgument(
- abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "Dictionary to read in, in the same format as one created by "
- + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
- "d").create();
-
- Option outOpt = obuilder.withLongName("output").withRequired(true)
- .withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create())
- .withDescription("Output directory to write top words").withShortName(
- "o").create();
-
- Option wordOpt = obuilder.withLongName("words").withRequired(false)
- .withArgument(
- abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
- "20").create()).withDescription("Number of words to print")
- .withShortName("w").create();
- Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
- false).withArgument(
- abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
- .create()).withDescription(
- "The dictionary file type (text|sequencefile)").withShortName("dt")
+ Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Path to an LDA output (a state)").withShortName("i").create();
+
+ Option dictOpt = obuilder.withLongName("dict").withRequired(true).withArgument(
+ abuilder.withName("dict").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Dictionary to read in, in the same format as one created by "
+ + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
+
+ Option outOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Output directory to write top words").withShortName("o").create();
+
+ Option wordOpt = obuilder.withLongName("words").withRequired(false).withArgument(
+ abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create()).withDescription(
+ "Number of words to print").withShortName("w").create();
+ Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
+ abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The dictionary file type (text|sequencefile)").withShortName("dt").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
.create();
- Option helpOpt = obuilder.withLongName("help").withDescription(
- "Print out help").withShortName("h").create();
- Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(
- outOpt).withOption(wordOpt).withOption(inputOpt).withOption(dictTypeOpt)
- .create();
+ Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(outOpt).withOption(wordOpt)
+ .withOption(inputOpt).withOption(dictTypeOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -209,18 +192,15 @@
List<String> wordList;
if (dictionaryType.equals("text")) {
- wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(
- dictFile)));
+ wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(dictFile)));
} else if (dictionaryType.equals("sequencefile")) {
FileSystem fs = FileSystem.get(new Path(dictFile).toUri(), config);
- wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs,
- dictFile));
+ wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs, dictFile));
} else {
throw new IllegalArgumentException("Invalid dictionary format");
}
- List<List<String>> topWords = topWordsForTopics(input, config, wordList,
- numWords);
+ List<List<String>> topWords = topWordsForTopics(input, config, wordList, numWords);
if (!output.exists()) {
if (!output.mkdirs()) {