You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/03/01 17:37:26 UTC

svn commit: r917577 - in /lucene/mahout/trunk: bin/mahout utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java

Author: robinanil
Date: Mon Mar  1 16:37:26 2010
New Revision: 917577

URL: http://svn.apache.org/viewvc?rev=917577&view=rev
Log:
Add the LDAPrintTopics tool to the mahout shell script as the "ldadump" command, and clean up code style in LDAPrintTopics.java

Modified:
    lucene/mahout/trunk/bin/mahout
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java

Modified: lucene/mahout/trunk/bin/mahout
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/bin/mahout?rev=917577&r1=917576&r2=917577&view=diff
==============================================================================
--- lucene/mahout/trunk/bin/mahout (original)
+++ lucene/mahout/trunk/bin/mahout Mon Mar  1 16:37:26 2010
@@ -60,6 +60,7 @@
   echo "  fpg                   run FPGrowth Driver for freq. pattern mining"
   echo "  kmeans                run kmeans clustering"
   echo "  lda                   run LDA clustering"
+  echo "  ldadump               dump the output state of LDA"
   echo "  lucenevector          generate vectors from a lucene index"
   echo "  trainclassifier       run Bayes/CBayes classifier training job"
   echo "  testclassifier        test Bayes/CBayes model using a pre-classified data"
@@ -186,6 +187,8 @@
   CLASS=org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver
 elif [ "$COMMAND" = "lda" ] ; then
   CLASS=org.apache.mahout.clustering.lda.LDADriver
+elif [ "$COMMAND" = "ldadump" ] ; then
+  CLASS=org.apache.mahout.clustering.lda.LDAPrintTopics
 elif [ "$COMMAND" = "fpg" ] ; then
   CLASS=org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver  
 elif [ "$COMMAND" = "dirichlet" ] ; then

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=917577&r1=917576&r2=917577&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Mon Mar  1 16:37:26 2010
@@ -50,7 +50,7 @@
  */
 public class LDAPrintTopics {
   
-  private LDAPrintTopics() { }
+  private LDAPrintTopics() {}
   
   private static class StringDoublePair implements Comparable<StringDoublePair> {
     private final double score;
@@ -83,9 +83,9 @@
   }
   
   public static List<List<String>> topWordsForTopics(String dir,
-    Configuration job,
-    List<String> wordList,
-    int numWordsToPrint) throws IOException {
+                                                     Configuration job,
+                                                     List<String> wordList,
+                                                     int numWordsToPrint) throws IOException {
     FileSystem fs = new Path(dir).getFileSystem(job);
     
     List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
@@ -121,18 +121,14 @@
   }
   
   // Expands the queue list to have a Queue for topic K
-  private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues,
-                                      int k) {
+  private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues, int k) {
     for (int i = queues.size(); i <= k; ++i) {
       queues.add(new PriorityQueue<StringDoublePair>());
     }
   }
   
   // Adds the word if the queue is below capacity, or the score is high enough
-  private static void maybeEnqueue(Queue<StringDoublePair> q,
-                                   String word,
-                                   double score,
-                                   int numWordsToPrint) {
+  private static void maybeEnqueue(Queue<StringDoublePair> q, String word, double score, int numWordsToPrint) {
     if (q.size() >= numWordsToPrint && score > q.peek().score) {
       q.poll();
     }
@@ -146,43 +142,30 @@
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
     
-    Option inputOpt = obuilder.withLongName("input").withRequired(true)
-    .withArgument(
-      abuilder.withName("input").withMinimum(1).withMaximum(1).create())
-      .withDescription("Path to an LDA output (a state)").withShortName("i")
-      .create();
-    
-    Option dictOpt = obuilder.withLongName("dict").withRequired(true)
-    .withArgument(
-      abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
-      .withDescription(
-        "Dictionary to read in, in the same format as one created by "
-        + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
-        "d").create();
-    
-    Option outOpt = obuilder.withLongName("output").withRequired(true)
-    .withArgument(
-      abuilder.withName("output").withMinimum(1).withMaximum(1).create())
-      .withDescription("Output directory to write top words").withShortName(
-      "o").create();
-    
-    Option wordOpt = obuilder.withLongName("words").withRequired(false)
-    .withArgument(
-      abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
-      "20").create()).withDescription("Number of words to print")
-      .withShortName("w").create();
-    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
-      false).withArgument(
-        abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
-        .create()).withDescription(
-        "The dictionary file type (text|sequencefile)").withShortName("dt")
+    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+      "Path to an LDA output (a state)").withShortName("i").create();
+    
+    Option dictOpt = obuilder.withLongName("dict").withRequired(true).withArgument(
+      abuilder.withName("dict").withMinimum(1).withMaximum(1).create()).withDescription(
+      "Dictionary to read in, in the same format as one created by "
+          + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
+    
+    Option outOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+      "Output directory to write top words").withShortName("o").create();
+    
+    Option wordOpt = obuilder.withLongName("words").withRequired(false).withArgument(
+      abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault("20").create()).withDescription(
+      "Number of words to print").withShortName("w").create();
+    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
+      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The dictionary file type (text|sequencefile)").withShortName("dt").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
-    Option helpOpt = obuilder.withLongName("help").withDescription(
-    "Print out help").withShortName("h").create();
     
-    Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(
-      outOpt).withOption(wordOpt).withOption(inputOpt).withOption(dictTypeOpt)
-      .create();
+    Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(outOpt).withOption(wordOpt)
+        .withOption(inputOpt).withOption(dictTypeOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -209,18 +192,15 @@
       
       List<String> wordList;
       if (dictionaryType.equals("text")) {
-        wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(
-          dictFile)));
+        wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(dictFile)));
       } else if (dictionaryType.equals("sequencefile")) {
         FileSystem fs = FileSystem.get(new Path(dictFile).toUri(), config);
-        wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs,
-          dictFile));
+        wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs, dictFile));
       } else {
         throw new IllegalArgumentException("Invalid dictionary format");
       }
       
-      List<List<String>> topWords = topWordsForTopics(input, config, wordList,
-        numWords);
+      List<List<String>> topWords = topWordsForTopics(input, config, wordList, numWords);
       
       if (!output.exists()) {
         if (!output.mkdirs()) {