You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/27 04:22:31 UTC

svn commit: r788918 - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering: canopy/CanopyClusteringJob.java canopy/CanopyDriver.java canopy/ClusterDriver.java kmeans/KMeansDriver.java

Author: gsingers
Date: Sat Jun 27 02:22:31 2009
New Revision: 788918

URL: http://svn.apache.org/viewvc?rev=788918&view=rev
Log:
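MAHOUT-138: Convert the Canopy drivers (CanopyClusteringJob, CanopyDriver, ClusterDriver) to use Commons CLI2 for command-line parsing, and register the help option in the KMeansDriver option group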

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java Sat Jun 27 02:22:31 2009
@@ -18,11 +18,28 @@
 package org.apache.mahout.clustering.canopy;
 
 import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
 
 import java.io.IOException;
 
+/**
+ * Runs the {@link org.apache.mahout.clustering.canopy.CanopyDriver#runJob(String, String, String, double, double, Class)}
+ * and then {@link org.apache.mahout.clustering.canopy.ClusterDriver#runJob(String, String, String, String, double, double, Class)}.
+ */
 public class CanopyClusteringJob {
-
+  private transient static Log log = LogFactory.getLog(CanopyClusteringJob.class);
   /**
    * The default name of the canopies output sub-directory.
    */     
@@ -39,14 +56,72 @@
    * @param args
    */
   public static void main(String[] args) throws IOException, ClassNotFoundException {
-    String input = args[0];
-    String output = args[1];
-    String measureClassName = args[2];
-    double t1 = Double.parseDouble(args[3]);
-    double t2 = Double.parseDouble(args[4]);
-    String vectorClassName = args[5];
-    Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
-    runJob(input, output, measureClassName, t1, t2, vectorClass);
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+            abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path to put the output in").withShortName("o").create();
+
+    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+        withDescription("The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
+
+    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+            abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Vector implementation class name.  Default is SparseVector.class").withShortName("v").create();
+    Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
+            abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
+            withDescription("t1").withShortName("t1").create();
+    Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
+            abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
+            withDescription("t2").withShortName("t2").create();
+
+
+
+    Option helpOpt = obuilder.withLongName("help").
+            withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+            .withOption(measureClassOpt).withOption(vectorClassOpt)
+            .withOption(t1Opt).withOption(t2Opt)
+            .withOption(helpOpt).create();
+
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+
+      String input = cmdLine.getValue(inputOpt).toString();
+      String output = cmdLine.getValue(outputOpt).toString();
+      String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+      if (cmdLine.hasOption(measureClassOpt)) {
+        measureClass = cmdLine.getValue(measureClassOpt).toString();
+      }
+
+      Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+              SparseVector.class
+              : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+      double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
+      double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
+
+      runJob(input, output, measureClass, t1, t2, vectorClass);
+
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    }
   }
 
   /**

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Sat Jun 27 02:22:31 2009
@@ -27,8 +27,19 @@
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
 import org.apache.commons.logging.LogFactory;
 import org.apache.commons.logging.Log;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.commandline.Parser;
 
 import java.io.IOException;
 
@@ -38,14 +49,70 @@
   }
 
   public static void main(String[] args) throws IOException, ClassNotFoundException {
-    String input = args[0];
-    String output = args[1];
-    String measureClassName = args[2];
-    double t1 = Double.parseDouble(args[3]);
-    double t2 = Double.parseDouble(args[4]);
-    String vectorClassName = args[5];
-    Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
-    runJob(input, output, measureClassName, t1, t2, vectorClass);
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+            abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path to put the output in").withShortName("o").create();
+
+    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+        abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+        withDescription("The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
+
+    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+            abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Vector implementation class name.  Default is SparseVector.class").withShortName("v").create();
+    Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
+            abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
+            withDescription("t1").withShortName("t1").create();
+    Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
+            abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
+            withDescription("t2").withShortName("t2").create();
+
+
+    Option helpOpt = obuilder.withLongName("help").
+            withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+            .withOption(measureClassOpt).withOption(vectorClassOpt)
+            .withOption(t1Opt).withOption(t2Opt)
+            .withOption(helpOpt).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+
+      String input = cmdLine.getValue(inputOpt).toString();
+      String output = cmdLine.getValue(outputOpt).toString();
+      String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+      if (cmdLine.hasOption(measureClassOpt)) {
+        measureClass = cmdLine.getValue(measureClassOpt).toString();
+      }
+
+      Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+              SparseVector.class
+              : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+      double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
+      double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
+
+      runJob(input, output, measureClass, t1, t2, vectorClass);
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+
+    }     
   }
 
   /**

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java Sat Jun 27 02:22:31 2009
@@ -28,26 +28,102 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapred.lib.IdentityReducer;
 import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.CommandLineUtil;
+import org.apache.mahout.utils.SquaredEuclideanDistanceMeasure;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
 
 import java.io.IOException;
 
 public class ClusterDriver {
 
+  private transient static Log log = LogFactory.getLog(ClusterDriver.class);
+
   public static final String DEFAULT_CLUSTER_OUTPUT_DIRECTORY = "/clusters";
 
   private ClusterDriver() {
   }
 
   public static void main(String[] args) throws IOException, ClassNotFoundException {
-    String points = args[0];
-    String canopies = args[1];
-    String output = args[2];
-    String measureClassName = args[3];
-    double t1 = Double.parseDouble(args[4]);
-    double t2 = Double.parseDouble(args[5]);
-    String vectorClassName = args[6];
-    Class<? extends Vector> vectorClass = (Class<? extends Vector>) Class.forName(vectorClassName);
-    runJob(points, canopies, output, measureClassName, t1, t2, vectorClass);
+
+    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+    ArgumentBuilder abuilder = new ArgumentBuilder();
+    GroupBuilder gbuilder = new GroupBuilder();
+
+    Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+            abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Vector implementation class name.  Default is SparseVector.class")
+            .withShortName("v").create();
+    Option t1Opt = obuilder.withLongName("t1").withRequired(true).withArgument(
+            abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
+            withDescription("t1").withShortName("t1").create();
+    Option t2Opt = obuilder.withLongName("t2").withRequired(true).withArgument(
+            abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
+            withDescription("t2").withShortName("t2").create();
+
+    Option pointsOpt = obuilder.withLongName("points").withRequired(true).withArgument(
+            abuilder.withName("points").withMinimum(1).withMaximum(1).create()).
+            withDescription("The path containing the points").withShortName("p").create();
+
+    Option canopiesOpt = obuilder.withLongName("canopies").withRequired(true).withArgument(
+            abuilder.withName("canopies").withMinimum(1).withMaximum(1).create()).
+            withDescription("The location of the canopies, as a Path").withShortName("c").create();
+
+    Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+            abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Distance Measure to use.  Default is SquaredEuclidean").withShortName("m").create();
+
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+            withDescription("The Path to put the output in").withShortName("o").create();
+
+    Option helpOpt = obuilder.withLongName("help").
+            withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options").withOption(vectorClassOpt)
+            .withOption(t1Opt).withOption(t2Opt)
+            .withOption(pointsOpt).withOption(canopiesOpt).withOption(measureClassOpt).withOption(outputOpt)
+            .withOption(helpOpt).create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+
+      String measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+      if (cmdLine.hasOption(measureClassOpt)) {
+        measureClass = cmdLine.getValue(measureClassOpt).toString();
+      }
+      String output = cmdLine.getValue(outputOpt).toString();
+      String canopies = cmdLine.getValue(canopiesOpt).toString();
+      String points = cmdLine.getValue(pointsOpt).toString();
+      Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
+              SparseVector.class
+              : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
+      double t1 = Double.parseDouble(cmdLine.getValue(t1Opt).toString());
+      double t2 = Double.parseDouble(cmdLine.getValue(t2Opt).toString());
+
+      runJob(points, canopies, output, measureClass, t1, t2, vectorClass);
+
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
+    }
+
+
   }
 
   /**

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=788918&r1=788917&r2=788918&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Sat Jun 27 02:22:31 2009
@@ -105,11 +105,12 @@
             abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).
             withDescription("The number of reduce tasks").withShortName("r").create();
 
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
-            .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt).withOption(kOpt)
-            .withOption(vectorClassOpt).withOption(overwriteOutput).create();
     Option helpOpt = obuilder.withLongName("help").
             withDescription("Print out help").withShortName("h").create();
+
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(measureClassOpt)
+            .withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt).withOption(kOpt)
+            .withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);