You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/07/19 21:06:15 UTC
svn commit: r965587 [2/2] - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/main/java/org/apache/...
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=965587&r1=965586&r2=965587&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Mon Jul 19 19:06:14 2010
@@ -31,14 +31,6 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
@@ -51,26 +43,42 @@ import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.WeightedVectorWritable;
-import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.VectorHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public final class ClusterDumper {
+public final class ClusterDumper extends AbstractJob {
+
+ public static final String OUTPUT_OPTION = "output";
+
+ public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
+
+ public static final String DICTIONARY_OPTION = "dictionary";
+
+ public static final String POINTS_DIR_OPTION = "pointsDir";
+
+ public static final String JSON_OPTION = "json";
+
+ public static final String NUM_WORDS_OPTION = "numWords";
+
+ public static final String SUBSTRING_OPTION = "substring";
+
+ public static final String SEQ_FILE_DIR_OPTION = "seqFileDir";
private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
- private final Path seqFileDir;
+ private Path seqFileDir;
- private final Path pointsDir;
+ private Path pointsDir = null;
- private String termDictionary;
+ private String termDictionary = null;
- private String dictionaryFormat;
+ private String dictionaryFormat = null;
- private String outputFile;
+ private String outputFile = null;
private int subString = Integer.MAX_VALUE;
@@ -86,14 +94,50 @@ public final class ClusterDumper {
init();
}
- private void init() throws IOException {
- if (this.pointsDir != null) {
- Configuration conf = new Configuration();
- // read in the points
- clusterIdToPoints = readPoints(this.pointsDir, conf);
- } else {
- clusterIdToPoints = Collections.emptyMap();
+ public ClusterDumper() {
+ setConf(new Configuration());
+ }
+
+ public static void main(String[] args) throws Exception {
+ new ClusterDumper().run(args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addOption(SEQ_FILE_DIR_OPTION, "s", "The directory containing Sequence Files for the Clusters", true);
+ addOption(OUTPUT_OPTION, "o", "Optional output directory. Default is to output to the console.");
+ addOption(SUBSTRING_OPTION, "b", "The number of chars of the asFormatString() to print");
+ addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
+ addOption(JSON_OPTION, "j", "Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries");
+ addOption(POINTS_DIR_OPTION, "p", "The directory containing points sequence files mapping input vectors to their cluster. "
+ + "If specified, then the program will output the points associated with a cluster");
+ addOption(DICTIONARY_OPTION, "d", "The dictionary file");
+ addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ seqFileDir = new Path(getOption(SEQ_FILE_DIR_OPTION));
+ if (hasOption(POINTS_DIR_OPTION)) {
+ pointsDir = new Path(getOption(POINTS_DIR_OPTION));
+ }
+ outputFile = getOption(OUTPUT_OPTION);
+ if (hasOption(SUBSTRING_OPTION)) {
+ int sub = Integer.parseInt(getOption(SUBSTRING_OPTION));
+ if (sub >= 0) {
+ setSubString(sub);
+ }
}
+ if (hasOption(JSON_OPTION)) {
+ setUseJSON(true);
+ }
+ termDictionary = getOption(DICTIONARY_OPTION);
+ dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);
+ if (hasOption(NUM_WORDS_OPTION)) {
+ setNumTopFeatures(Integer.parseInt(getOption(NUM_WORDS_OPTION)));
+ }
+ printClusters(null);
+ return 0;
}
public void printClusters(String[] dictionary) throws IOException, InstantiationException, IllegalAccessException {
@@ -163,6 +207,16 @@ public final class ClusterDumper {
}
}
+ private void init() throws IOException {
+ if (this.pointsDir != null) {
+ Configuration conf = new Configuration();
+ // read in the points
+ clusterIdToPoints = readPoints(this.pointsDir, conf);
+ } else {
+ clusterIdToPoints = Collections.emptyMap();
+ }
+ }
+
public String getOutputFile() {
return outputFile;
}
@@ -200,114 +254,11 @@ public final class ClusterDumper {
return this.numTopFeatures;
}
- public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false).withArgument(
- abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The directory containing Sequence Files for the Clusters").withShortName("s").create();
- Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output file. If not specified, dumps to the console").withShortName("o").create();
- Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
- abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).withDescription(
- "The number of chars of the asFormatString() to print").withShortName("b").create();
- Option numWordsOpt = obuilder.withLongName("numWords").withRequired(false).withArgument(
- abuilder.withName("numWords").withMinimum(1).withMaximum(1).create())
- .withDescription("The number of top terms to print").withShortName("n").create();
- Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
- "Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries")
- .withShortName("j").create();
- Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
- abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
- "The directory containing points sequence files mapping input vectors to their cluster. "
- + "If specified, then the program will output the points associated with a cluster")
- .withShortName("p").create();
- Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
- abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create())
- .withDescription("The dictionary file. ").withShortName("d").create();
- Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
- abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
- "The dictionary file type (text|sequencefile)").withShortName("dt").create();
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(seqOpt).withOption(outputOpt)
- .withOption(substringOpt).withOption(pointsOpt).withOption(centroidJSonOpt)
- .withOption(dictOpt).withOption(dictTypeOpt)
- .withOption(numWordsOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
- if (!cmdLine.hasOption(seqOpt)) {
- return;
- }
- Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
- String termDictionary = null;
- if (cmdLine.hasOption(dictOpt)) {
- termDictionary = cmdLine.getValue(dictOpt).toString();
- }
-
- Path pointsDir = null;
- if (cmdLine.hasOption(pointsOpt)) {
- pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
- }
- String outputFile = null;
- if (cmdLine.hasOption(outputOpt)) {
- outputFile = cmdLine.getValue(outputOpt).toString();
- }
-
- int sub = -1;
- if (cmdLine.hasOption(substringOpt)) {
- sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
- }
-
- ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
- if (cmdLine.hasOption(centroidJSonOpt)) {
- clusterDumper.setUseJSON(true);
- }
-
- if (outputFile != null) {
- clusterDumper.setOutputFile(outputFile);
- }
-
- String dictionaryType = "text";
- if (cmdLine.hasOption(dictTypeOpt)) {
- dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
- }
-
- if (termDictionary != null) {
- clusterDumper.setTermDictionary(termDictionary, dictionaryType);
- }
-
- if (cmdLine.hasOption(numWordsOpt)) {
- int numWords = Integer.parseInt(cmdLine.getValue(numWordsOpt).toString());
- clusterDumper.setNumTopFeatures(numWords);
- }
-
- if (sub >= 0) {
- clusterDumper.setSubString(sub);
- }
- clusterDumper.printClusters(null);
- } catch (OptionException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- }
- }
-
private void setUseJSON(boolean json) {
this.useJSON = json;
}
- private static Map<Integer, List<WeightedVectorWritable>> readPoints(Path pointsPathDir, Configuration conf)
- throws IOException {
+ private static Map<Integer, List<WeightedVectorWritable>> readPoints(Path pointsPathDir, Configuration conf) throws IOException {
Map<Integer, List<WeightedVectorWritable>> result = new TreeMap<Integer, List<WeightedVectorWritable>>();
FileSystem fs = pointsPathDir.getFileSystem(conf);