You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/27 04:22:31 UTC
svn commit: r788918 - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering: canopy/CanopyClusteringJob.java canopy/CanopyDriver.java canopy/ClusterDriver.java kmeans/KMeansDriver.java
Author: gsingers
Date: Sat Jun 27 02:22:31 2009
New Revision: 788918
URL: http://svn.apache.org/viewvc?rev=788918&view=rev
Log:
MAHOUT-138: convert Canopy to use CLI2
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java Sat Jun 27 02:22:31 2009
@@ -18,11 +18,28 @@
package org.apache.mahout.clustering.canopy;
import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
import java.io.IOException;
+/**
+ * Runs the {@link org.apache.mahout.clustering.canopy.CanopyDriver#runJob(String, String, String, double, double, Class)}
+ * and then {@link org.apache.mahout.clustering.canopy.ClusterDriver#runJob(String, String, String, String, double, double, Class)}.
+ */
public class CanopyClusteringJob {
-
+ private transient static Log log = LogFactory.getLog(CanopyClusteringJob.class);
/**
* The default name of the canopies output sub-directory.
*/
@@ -39,14 +56,72 @@
* @param args
*/
public static void main(String[] args) throws IOException, ClassNotFoundException {
- String input = args[0];
- String output = args[1];
- String measureClassName = args[2];
- double t1 = Double.parseDouble(args[3]);
- double t2 = Double.parseDouble(args[4]);
- String vectorClassName = args[5];
- Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
- runJob(input, output, measureClassName, t1, t2, vectorClass);
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path to put the output in").withShortName("o").create();
+
+ Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+
+ Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Vector implementation class name. Default is SparseVector.class").withShortName("v").create();
+ Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
+ abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
+ withDescription("t1").withShortName("t1").create();
+ Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
+ abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
+ withDescription("t2").withShortName("t2").create();
+
+
+
+ Option helpOpt = obuilder.withLongName("help").
+ withDescription("Print out help").withShortName("h").create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+ .withOption(measureClassOpt).withOption(vectorClassOpt)
+ .withOption(t1Opt).withOption(t2Opt)
+ .withOption(helpOpt).create();
+
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String input = cmdLine.getValue(inputOpt).toString();
+ String output = cmdLine.getValue(outputOpt).toString();
+ String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ if (cmdLine.hasOption(measureClassOpt)) {
+ measureClass = cmdLine.getValue(measureClassOpt).toString();
+ }
+
+ Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+ SparseVector.class
+ : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+ double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
+ double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
+
+ runJob(input, output, measureClass, t1, t2, vectorClass);
+
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Sat Jun 27 02:22:31 2009
@@ -27,8 +27,19 @@
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.commandline.Parser;
import java.io.IOException;
@@ -38,14 +49,70 @@
}
public static void main(String[] args) throws IOException, ClassNotFoundException {
- String input = args[0];
- String output = args[1];
- String measureClassName = args[2];
- double t1 = Double.parseDouble(args[3]);
- double t2 = Double.parseDouble(args[4]);
- String vectorClassName = args[5];
- Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
- runJob(input, output, measureClassName, t1, t2, vectorClass);
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path to put the output in").withShortName("o").create();
+
+ Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+
+ Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Vector implementation class name. Default is SparseVector.class").withShortName("v").create();
+ Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
+ abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
+ withDescription("t1").withShortName("t1").create();
+ Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
+ abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
+ withDescription("t2").withShortName("t2").create();
+
+
+ Option helpOpt = obuilder.withLongName("help").
+ withDescription("Print out help").withShortName("h").create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+ .withOption(measureClassOpt).withOption(vectorClassOpt)
+ .withOption(t1Opt).withOption(t2Opt)
+ .withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String input = cmdLine.getValue(inputOpt).toString();
+ String output = cmdLine.getValue(outputOpt).toString();
+ String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ if (cmdLine.hasOption(measureClassOpt)) {
+ measureClass = cmdLine.getValue(measureClassOpt).toString();
+ }
+
+ Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+ SparseVector.class
+ : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+ double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
+ double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
+
+ runJob(input, output, measureClass, t1, t2, vectorClass);
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+
+ }
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java Sat Jun 27 02:22:31 2009
@@ -28,26 +28,102 @@
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
import java.io.IOException;
public class ClusterDriver {
+ private transient static Log log = LogFactory.getLog(ClusterDriver.class);
+
public static final String DEFAULT_CLUSTER_OUTPUT_DIRECTORY = "/clusters";
private ClusterDriver() {
}
public static void main(String[] args) throws IOException, ClassNotFoundException {
- String points = args[0];
- String canopies = args[1];
- String output = args[2];
- String measureClassName = args[3];
- double t1 = Double.parseDouble(args[4]);
- double t2 = Double.parseDouble(args[5]);
- String vectorClassName = args[6];
- Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
- runJob(points, canopies, output, measureClassName, t1, t2, vectorClass);
+
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Vector implementation class name. Default is SparseVector.class")
+ .withShortName("v").create();
+ Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
+ abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
+ withDescription("t1").withShortName("t1").create();
+ Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
+ abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
+ withDescription("t2").withShortName("t2").create();
+
+ Option pointsOpt = obuilder.withLongName("points").withRequired(true).withArgument(
+ abuilder.withName("points").withMinimum(1).withMaximum(1).create()).
+ withDescription("The path containing the points").withShortName("p").create();
+
+ Option canopiesOpt = obuilder.withLongName("canopies").withRequired(true).withArgument(
+ abuilder.withName("canopies").withMinimum(1).withMaximum(1).create()).
+ withDescription("The location of the canopies, as a Path").withShortName("c").create();
+
+ Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+ withDescription("The Path to put the output in").withShortName("o").create();
+
+ Option helpOpt = obuilder.withLongName("help").
+ withDescription("Print out help").withShortName("h").create();
+
+ Group group = gbuilder.withName("Options").withOption(vectorClassOpt)
+ .withOption(t1Opt).withOption(t2Opt)
+ .withOption(pointsOpt).withOption(canopiesOpt).withOption(measureClassOpt).withOption(outputOpt)
+ .withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ if (cmdLine.hasOption(measureClassOpt)) {
+ measureClass = cmdLine.getValue(measureClassOpt).toString();
+ }
+ String output = cmdLine.getValue(outputOpt).toString();
+ String canopies = cmdLine.getValue(canopiesOpt).toString();
+ String points = cmdLine.getValue(pointsOpt).toString();
+ Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+ SparseVector.class
+ : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+ double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
+ double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
+
+ runJob(points, canopies, output, measureClass, t1, t2, vectorClass);
+
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
+
+
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Sat Jun 27 02:22:31 2009
@@ -105,11 +105,12 @@
abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
withDescription("The number of reduce tasks").withShortName("r").create();
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
- .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt).withOption(kOpt)
- .withOption(vectorClassOpt).withOption(overwriteOutput).create();
Option helpOpt = obuilder.withLongName("help").
withDescription("Print out help").withShortName("h").create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
+ .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt).withOption(kOpt)
+ .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);