You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/03/01 17:51:37 UTC

svn commit: r917589 - /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java

Author: robinanil
Date: Mon Mar  1 16:51:37 2010
New Revision: 917589

URL: http://svn.apache.org/viewvc?rev=917589&view=rev
Log:
Fix for 0.3: LDAPrintTopics now matches the behaviour of seqdump and clusterdump by printing its output to stdout when no output directory is specified

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=917589&r1=917588&r2=917589&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Mon Mar  1 16:51:37 2010
@@ -50,8 +50,6 @@
  */
 public class LDAPrintTopics {
   
-  private LDAPrintTopics() {}
-  
   private static class StringDoublePair implements Comparable<StringDoublePair> {
     private final double score;
     private final String word;
@@ -82,44 +80,6 @@
     
   }
   
-  public static List<List<String>> topWordsForTopics(String dir,
-                                                     Configuration job,
-                                                     List<String> wordList,
-                                                     int numWordsToPrint) throws IOException {
-    FileSystem fs = new Path(dir).getFileSystem(job);
-    
-    List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
-    
-    IntPairWritable key = new IntPairWritable();
-    DoubleWritable value = new DoubleWritable();
-    for (FileStatus status : fs.globStatus(new Path(dir, "*"))) {
-      Path path = status.getPath();
-      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
-      while (reader.next(key, value)) {
-        int topic = key.getX();
-        int word = key.getY();
-        
-        ensureQueueSize(queues, topic);
-        if (word >= 0 && topic >= 0) {
-          double score = value.get();
-          String realWord = wordList.get(word);
-          maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
-        }
-      }
-      reader.close();
-    }
-    
-    List<List<String>> result = new ArrayList<List<String>>();
-    for (int i = 0; i < queues.size(); ++i) {
-      result.add(i, new LinkedList<String>());
-      for (StringDoublePair sdp : queues.get(i)) {
-        result.get(i).add(0, sdp.word); // prepend
-      }
-    }
-    
-    return result;
-  }
-  
   // Expands the queue list to have a Queue for topic K
   private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues, int k) {
     for (int i = queues.size(); i <= k; ++i) {
@@ -127,16 +87,6 @@
     }
   }
   
-  // Adds the word if the queue is below capacity, or the score is high enough
-  private static void maybeEnqueue(Queue<StringDoublePair> q, String word, double score, int numWordsToPrint) {
-    if (q.size() >= numWordsToPrint && score > q.peek().score) {
-      q.poll();
-    }
-    if (q.size() < numWordsToPrint) {
-      q.add(new StringDoublePair(score, word));
-    }
-  }
-  
   public static void main(String[] args) throws Exception {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
@@ -151,7 +101,7 @@
       "Dictionary to read in, in the same format as one created by "
           + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
     
-    Option outOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+    Option outOpt = obuilder.withLongName("output").withRequired(false).withArgument(
       abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
       "Output directory to write top words").withShortName("o").create();
     
@@ -177,7 +127,6 @@
       }
       
       String input = cmdLine.getValue(inputOpt).toString();
-      File output = new File(cmdLine.getValue(outOpt).toString());
       String dictFile = cmdLine.getValue(dictOpt).toString();
       int numWords = 20;
       if (cmdLine.hasOption(wordOpt)) {
@@ -202,22 +151,16 @@
       
       List<List<String>> topWords = topWordsForTopics(input, config, wordList, numWords);
       
-      if (!output.exists()) {
-        if (!output.mkdirs()) {
-          throw new IOException("Could not create directory: " + output);
+      if (cmdLine.hasOption(outOpt)) {
+        File output = new File(cmdLine.getValue(outOpt).toString());
+        if (!output.exists()) {
+          if (!output.mkdirs()) {
+            throw new IOException("Could not create directory: " + output);
+          }
         }
-      }
-      
-      for (int i = 0; i < topWords.size(); ++i) {
-        List<String> topK = topWords.get(i);
-        File out = new File(output, "topic-" + i);
-        PrintWriter writer = new PrintWriter(new FileWriter(out));
-        writer.println("Topic " + i);
-        writer.println("===========");
-        for (String word : topK) {
-          writer.println(word);
-        }
-        writer.close();
+        writeTopWords(topWords, output);
+      } else {
+        printTopWords(topWords);
       }
       
     } catch (OptionException e) {
@@ -226,4 +169,78 @@
     }
   }
   
+  // Adds the word if the queue is below capacity, or the score is high enough
+  private static void maybeEnqueue(Queue<StringDoublePair> q, String word, double score, int numWordsToPrint) {
+    if (q.size() >= numWordsToPrint && score > q.peek().score) {
+      q.poll();
+    }
+    if (q.size() < numWordsToPrint) {
+      q.add(new StringDoublePair(score, word));
+    }
+  }
+  
+  private static void printTopWords(List<List<String>> topWords) throws IOException {
+    for (int i = 0; i < topWords.size(); ++i) {
+      List<String> topK = topWords.get(i);
+      System.out.println("Topic " + i);
+      System.out.println("===========");
+      for (String word : topK) {
+        System.out.println(word);
+      }
+    }
+  }
+  
+  public static List<List<String>> topWordsForTopics(String dir,
+                                                     Configuration job,
+                                                     List<String> wordList,
+                                                     int numWordsToPrint) throws IOException {
+    FileSystem fs = new Path(dir).getFileSystem(job);
+    
+    List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
+    
+    IntPairWritable key = new IntPairWritable();
+    DoubleWritable value = new DoubleWritable();
+    for (FileStatus status : fs.globStatus(new Path(dir, "*"))) {
+      Path path = status.getPath();
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+      while (reader.next(key, value)) {
+        int topic = key.getX();
+        int word = key.getY();
+        
+        ensureQueueSize(queues, topic);
+        if (word >= 0 && topic >= 0) {
+          double score = value.get();
+          String realWord = wordList.get(word);
+          maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
+        }
+      }
+      reader.close();
+    }
+    
+    List<List<String>> result = new ArrayList<List<String>>();
+    for (int i = 0; i < queues.size(); ++i) {
+      result.add(i, new LinkedList<String>());
+      for (StringDoublePair sdp : queues.get(i)) {
+        result.get(i).add(0, sdp.word); // prepend
+      }
+    }
+    
+    return result;
+  }
+  
+  private static void writeTopWords(List<List<String>> topWords, File output) throws IOException {
+    for (int i = 0; i < topWords.size(); ++i) {
+      List<String> topK = topWords.get(i);
+      File out = new File(output, "topic-" + i);
+      PrintWriter writer = new PrintWriter(new FileWriter(out));
+      writer.println("Topic " + i);
+      writer.println("===========");
+      for (String word : topK) {
+        writer.println(word);
+      }
+      writer.close();
+    }
+  }
+  
+  private LDAPrintTopics() { }
 }