You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2011/07/15 13:30:14 UTC
svn commit: r1147100 -
/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
Author: gsingers
Date: Fri Jul 15 11:30:14 2011
New Revision: 1147100
URL: http://svn.apache.org/viewvc?rev=1147100&view=rev
Log:
Add the ability to restrict the number of key/value pairs to dump (new -n option).
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=1147100&r1=1147099&r2=1147100&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java Fri Jul 15 11:30:14 2011
@@ -17,10 +17,6 @@
package org.apache.mahout.utils;
-import java.io.File;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-
import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
@@ -41,45 +37,52 @@ import org.apache.mahout.common.iterator
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.File;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
public final class SequenceFileDumper {
-
+
private static final Logger log = LoggerFactory.getLogger(SequenceFileDumper.class);
-
+
private SequenceFileDumper() {
}
-
+
public static void main(String[] args) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
- abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
- withDescription("The Sequence File containing the Clusters").withShortName("s").create();
+ abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Sequence File containing the Clusters").withShortName("s").create();
Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The output file. If not specified, dumps to the console").withShortName("o").create();
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The output file. If not specified, dumps to the console").withShortName("o").create();
Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
- abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
- withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
+ abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
+ withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
Option countOpt = obuilder.withLongName("count").withRequired(false).
- withDescription("Report the count only").withShortName("c").create();
+ withDescription("Report the count only").withShortName("c").create();
+ Option numItemsOpt = obuilder.withLongName("n").withRequired(false).withArgument(
+ abuilder.withName("numItems").withMinimum(1).withMaximum(1).create()).
+ withDescription("Output at most <n> key value pairs").withShortName("n").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
+
Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
- .withOption(substringOpt).withOption(countOpt).withOption(helpOpt).create();
-
+ .withOption(substringOpt).withOption(countOpt).withOption(numItemsOpt).withOption(helpOpt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
-
+
printHelp(group);
return;
}
-
+
if (cmdLine.hasOption(seqOpt)) {
Path path = new Path(cmdLine.getValue(seqOpt).toString());
Configuration conf = new Configuration();
@@ -98,7 +101,7 @@ public final class SequenceFileDumper {
sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
}
boolean countOnly = cmdLine.hasOption(countOpt);
- SequenceFileIterator<?,?> iterator = new SequenceFileIterator<Writable,Writable>(path, true, conf);
+ SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
writer.append("Key class: ").append(iterator.getKeyClass().toString());
writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
long count = 0;
@@ -109,8 +112,13 @@ public final class SequenceFileDumper {
}
writer.append("Count: ").append(String.valueOf(count)).append('\n');
} else {
- while (iterator.hasNext()) {
- Pair<?,?> record = iterator.next();
+ long numItems = Long.MAX_VALUE;
+ if (cmdLine.hasOption(numItemsOpt)) {
+ numItems = Long.parseLong(cmdLine.getValue(numItemsOpt).toString());
+ writer.append("Max Items to dump: ").append(String.valueOf(numItems));
+ }
+ while (iterator.hasNext() && count < numItems) {
+ Pair<?, ?> record = iterator.next();
writer.append("Key: ").append(record.getFirst().toString());
String str = record.getSecond().toString();
writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
@@ -123,14 +131,14 @@ public final class SequenceFileDumper {
Closeables.closeQuietly(writer);
}
}
-
+
} catch (OptionException e) {
log.error("Exception", e);
printHelp(group);
}
-
+
}
-
+
private static void printHelp(Group group) {
HelpFormatter formatter = new HelpFormatter();
formatter.setGroup(group);