You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/03/01 17:51:37 UTC
svn commit: r917589 -
/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
Author: robinanil
Date: Mon Mar 1 16:51:37 2010
New Revision: 917589
URL: http://svn.apache.org/viewvc?rev=917589&view=rev
Log:
Fix for 0.3: LDAPrintTopics now matches the behaviour of seqdump and clusterdump by printing its output to stdout when no output directory is specified
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=917589&r1=917588&r2=917589&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Mon Mar 1 16:51:37 2010
@@ -50,8 +50,6 @@
*/
public class LDAPrintTopics {
- private LDAPrintTopics() {}
-
private static class StringDoublePair implements Comparable<StringDoublePair> {
private final double score;
private final String word;
@@ -82,44 +80,6 @@
}
- public static List<List<String>> topWordsForTopics(String dir,
- Configuration job,
- List<String> wordList,
- int numWordsToPrint) throws IOException {
- FileSystem fs = new Path(dir).getFileSystem(job);
-
- List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
-
- IntPairWritable key = new IntPairWritable();
- DoubleWritable value = new DoubleWritable();
- for (FileStatus status : fs.globStatus(new Path(dir, "*"))) {
- Path path = status.getPath();
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
- while (reader.next(key, value)) {
- int topic = key.getX();
- int word = key.getY();
-
- ensureQueueSize(queues, topic);
- if (word >= 0 && topic >= 0) {
- double score = value.get();
- String realWord = wordList.get(word);
- maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
- }
- }
- reader.close();
- }
-
- List<List<String>> result = new ArrayList<List<String>>();
- for (int i = 0; i < queues.size(); ++i) {
- result.add(i, new LinkedList<String>());
- for (StringDoublePair sdp : queues.get(i)) {
- result.get(i).add(0, sdp.word); // prepend
- }
- }
-
- return result;
- }
-
// Expands the queue list to have a Queue for topic K
private static void ensureQueueSize(List<PriorityQueue<StringDoublePair>> queues, int k) {
for (int i = queues.size(); i <= k; ++i) {
@@ -127,16 +87,6 @@
}
}
- // Adds the word if the queue is below capacity, or the score is high enough
- private static void maybeEnqueue(Queue<StringDoublePair> q, String word, double score, int numWordsToPrint) {
- if (q.size() >= numWordsToPrint && score > q.peek().score) {
- q.poll();
- }
- if (q.size() < numWordsToPrint) {
- q.add(new StringDoublePair(score, word));
- }
- }
-
public static void main(String[] args) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
@@ -151,7 +101,7 @@
"Dictionary to read in, in the same format as one created by "
+ "org.apache.mahout.utils.vectors.lucene.Driver").withShortName("d").create();
- Option outOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ Option outOpt = obuilder.withLongName("output").withRequired(false).withArgument(
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
"Output directory to write top words").withShortName("o").create();
@@ -177,7 +127,6 @@
}
String input = cmdLine.getValue(inputOpt).toString();
- File output = new File(cmdLine.getValue(outOpt).toString());
String dictFile = cmdLine.getValue(dictOpt).toString();
int numWords = 20;
if (cmdLine.hasOption(wordOpt)) {
@@ -202,22 +151,16 @@
List<List<String>> topWords = topWordsForTopics(input, config, wordList, numWords);
- if (!output.exists()) {
- if (!output.mkdirs()) {
- throw new IOException("Could not create directory: " + output);
+ if (cmdLine.hasOption(outOpt)) {
+ File output = new File(cmdLine.getValue(outOpt).toString());
+ if (!output.exists()) {
+ if (!output.mkdirs()) {
+ throw new IOException("Could not create directory: " + output);
+ }
}
- }
-
- for (int i = 0; i < topWords.size(); ++i) {
- List<String> topK = topWords.get(i);
- File out = new File(output, "topic-" + i);
- PrintWriter writer = new PrintWriter(new FileWriter(out));
- writer.println("Topic " + i);
- writer.println("===========");
- for (String word : topK) {
- writer.println(word);
- }
- writer.close();
+ writeTopWords(topWords, output);
+ } else {
+ printTopWords(topWords);
}
} catch (OptionException e) {
@@ -226,4 +169,78 @@
}
}
+ // Adds the word if the queue is below capacity, or the score is high enough
+ private static void maybeEnqueue(Queue<StringDoublePair> q, String word, double score, int numWordsToPrint) {
+ if (q.size() >= numWordsToPrint && score > q.peek().score) {
+ q.poll();
+ }
+ if (q.size() < numWordsToPrint) {
+ q.add(new StringDoublePair(score, word));
+ }
+ }
+
+ private static void printTopWords(List<List<String>> topWords) throws IOException {
+ for (int i = 0; i < topWords.size(); ++i) {
+ List<String> topK = topWords.get(i);
+ System.out.println("Topic " + i);
+ System.out.println("===========");
+ for (String word : topK) {
+ System.out.println(word);
+ }
+ }
+ }
+
+ public static List<List<String>> topWordsForTopics(String dir,
+ Configuration job,
+ List<String> wordList,
+ int numWordsToPrint) throws IOException {
+ FileSystem fs = new Path(dir).getFileSystem(job);
+
+ List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
+
+ IntPairWritable key = new IntPairWritable();
+ DoubleWritable value = new DoubleWritable();
+ for (FileStatus status : fs.globStatus(new Path(dir, "*"))) {
+ Path path = status.getPath();
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+ while (reader.next(key, value)) {
+ int topic = key.getX();
+ int word = key.getY();
+
+ ensureQueueSize(queues, topic);
+ if (word >= 0 && topic >= 0) {
+ double score = value.get();
+ String realWord = wordList.get(word);
+ maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
+ }
+ }
+ reader.close();
+ }
+
+ List<List<String>> result = new ArrayList<List<String>>();
+ for (int i = 0; i < queues.size(); ++i) {
+ result.add(i, new LinkedList<String>());
+ for (StringDoublePair sdp : queues.get(i)) {
+ result.get(i).add(0, sdp.word); // prepend
+ }
+ }
+
+ return result;
+ }
+
+ private static void writeTopWords(List<List<String>> topWords, File output) throws IOException {
+ for (int i = 0; i < topWords.size(); ++i) {
+ List<String> topK = topWords.get(i);
+ File out = new File(output, "topic-" + i);
+ PrintWriter writer = new PrintWriter(new FileWriter(out));
+ writer.println("Topic " + i);
+ writer.println("===========");
+ for (String word : topK) {
+ writer.println(word);
+ }
+ writer.close();
+ }
+ }
+
+ private LDAPrintTopics() { }
}