You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/10/01 01:07:47 UTC

svn commit: r1003329 - in /mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral: eigencuts/EigencutsDriver.java kmeans/SpectralKMeansDriver.java

Author: jeastman
Date: Thu Sep 30 23:07:46 2010
New Revision: 1003329

URL: http://svn.apache.org/viewvc?rev=1003329&view=rev
Log:
MAHOUT-519:
- SpectralKMeansDriver: added distanceMeasureOption, convergenceOption, maxIterationsOption and overwriteOption
- SpectralKMeansDriver: factored out a static run() method that takes the required Java parameters, for programmatic invocation
- EigencutsDriver: added overwriteOption
- EigencutsDriver: factored out a static run() method that takes the required Java parameters, for programmatic invocation
No tests currently exercise the CLI for these drivers, so the changes were tested manually.

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java?rev=1003329&r1=1003328&r2=1003329&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java Thu Sep 30 23:07:46 2010
@@ -30,6 +30,8 @@ import org.apache.mahout.clustering.spec
 import org.apache.mahout.clustering.spectral.common.MatrixDiagonalizeJob;
 import org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Matrix;
@@ -40,20 +42,12 @@ import org.apache.mahout.math.hadoop.dec
 import org.apache.mahout.math.stats.OnlineSummarizer;
 
 public class EigencutsDriver extends AbstractJob {
-  
+
   public static final double EPSILON_DEFAULT = 0.25;
+
   public static final double TAU_DEFAULT = -0.1;
+
   public static final double OVERSHOOT_MULTIPLIER = 1.5;
-  
-  // parameters of interest
-  /** number of dimensions in the square affinity matrix */
-  private int dimensions;
-  /** user-supplied minimum half-life threshold */
-  private double halflife;
-  /** user-supplied coefficient for setting minimum half-life threshold */
-  private double epsilon;
-  /** user-supplied threshold for cutting links in the affinity graph */
-  private double tau;
 
   public static void main(String args[]) throws Exception {
     ToolRunner.run(new EigencutsDriver(), args);
@@ -61,7 +55,7 @@ public class EigencutsDriver extends Abs
 
   @Override
   public int run(String[] arg0) throws Exception {
-    
+
     // set up command line arguments
     Configuration conf = new Configuration();
     addOption("input", "i", "Path to input affinity matrix data", true);
@@ -70,26 +64,52 @@ public class EigencutsDriver extends Abs
     addOption("dimensions", "d", "Square dimensions of affinity matrix", true);
     addOption("epsilon", "e", "Half-life threshold coefficient", Double.toString(EigencutsDriver.EPSILON_DEFAULT));
     addOption("tau", "t", "Threshold for cutting affinities", Double.toString(EigencutsDriver.TAU_DEFAULT));
+    addOption(DefaultOptionCreator.overwriteOption().create());
     Map<String, String> parsedArgs = parseArguments(arg0);
     if (parsedArgs == null) {
       return 0;
     }
-    
+
     // read in the command line values
     Path input = new Path(parsedArgs.get("--input"));
     Path output = new Path(parsedArgs.get("--output"));
-    dimensions = Integer.parseInt(parsedArgs.get("--dimensions"));
-    halflife = Integer.parseInt(parsedArgs.get("--half-life"));
-    epsilon = Double.parseDouble(parsedArgs.get("--epsilon"));
-    tau = Double.parseDouble(parsedArgs.get("--tau"));
-    
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.overwriteOutput(output);
+    }
+    int dimensions = Integer.parseInt(parsedArgs.get("--dimensions"));
+    double halflife = Double.parseDouble(parsedArgs.get("--half-life"));
+    double epsilon = Double.parseDouble(parsedArgs.get("--epsilon"));
+    double tau = Double.parseDouble(parsedArgs.get("--tau"));
+
+    run(conf, input, output, dimensions, halflife, epsilon, tau);
+
+    return 0;
+  }
+
+  /**
+   * Run the Eigencuts clustering algorithm using the supplied arguments
+   * 
+   * @param conf the Configuration to use
+   * @param input the Path to the directory containing input affinity tuples
+   * @param output the Path to the output directory
+   * @param dimensions the int number of dimensions of the square affinity matrix
+   * @param halflife the double minimum half-life threshold
+   * @param epsilon the double coefficient for setting minimum half-life threshold
+   * @param tau the double tau threshold for cutting links in the affinity graph
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws ClassNotFoundException
+   */
+  public static void run(Configuration conf, Path input, Path output, int dimensions, double halflife, double epsilon, double tau)
+      throws IOException, InterruptedException, ClassNotFoundException {
+    // set the instance variables
     // create a few new Paths for temp files and transformations
     Path outputCalc = new Path(output, "calculations");
     Path outputTmp = new Path(output, "temporary");
-    
+
     DistributedRowMatrix A = AffinityMatrixInputJob.runJob(input, outputCalc, dimensions);
     Vector D = MatrixDiagonalizeJob.runJob(A.getRowPath(), dimensions);
-    
+
     long numCuts = 0;
     do {
       // first three steps are the same as spectral k-means:
@@ -97,51 +117,44 @@ public class EigencutsDriver extends Abs
       // 2) calculate L = D^-0.5 * A * D^-0.5
       // 3) calculate eigenvectors of L
 
-      DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(A.getRowPath(), 
-        D, new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
+      DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(A.getRowPath(), D, new Path(outputCalc, "laplacian-"
+          + (System.nanoTime() & 0xFF)));
       L.configure(new JobConf(conf));
-    
+
       // eigendecomposition (step 3)
-      int overshoot = (int)((double)dimensions * 
-        EigencutsDriver.OVERSHOOT_MULTIPLIER);
+      int overshoot = (int) ((double) dimensions * EigencutsDriver.OVERSHOOT_MULTIPLIER);
       List<Double> eigenValues = new ArrayList<Double>(overshoot);
       Matrix eigenVectors = new DenseMatrix(overshoot, dimensions);
-      DistributedRowMatrix U = performEigenDecomposition(L, 
-        dimensions, overshoot, eigenValues, eigenVectors, 
-        outputCalc);
+      DistributedRowMatrix U = performEigenDecomposition(L, dimensions, overshoot, eigenValues, eigenVectors, outputCalc);
       U.configure(new JobConf(conf));
       eigenValues = eigenValues.subList(0, dimensions);
-    
+
       // here's where things get interesting: steps 4, 5, and 6 are unique
       // to this algorithm, and depending on the final output, steps 1-3
       // may be repeated as well
-    
+
       // helper method, since apparently List and Vector objects don't play nicely
       Vector evs = listToVector(eigenValues);
-    
+
       // calculate sensitivities (step 4 and step 5)
       Path sensitivities = new Path(outputCalc, "sensitivities-" + (System.nanoTime() & 0xFF));
-      EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife, 
-        tau, median(D), epsilon, sensitivities);
-    
+      EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife, tau, median(D), epsilon, sensitivities);
+
       // perform the cuts (step 6)
       input = new Path(outputTmp, "nextAff-" + (System.nanoTime() & 0xFF));
       numCuts = EigencutsAffinityCutsJob.runjob(A.getRowPath(), sensitivities, input, conf);
-    
+
       // how many cuts were made?
       if (numCuts > 0) {
         // recalculate A
-        A = new DistributedRowMatrix(input, new Path(outputTmp, Long.toString(System.nanoTime())),
-            dimensions, dimensions);
+        A = new DistributedRowMatrix(input, new Path(outputTmp, Long.toString(System.nanoTime())), dimensions, dimensions);
         A.configure(new JobConf());
       }
     } while (numCuts > 0);
-    
-    // TODO: output format???
-    
-    return 0;
+
+    // TODO: MAHOUT-517: Eigencuts needs an output format
   }
-  
+
   /**
    * Does most of the heavy lifting in setting up Paths, configuring return
    * values, and generally performing the tedious administrative tasks involved
@@ -154,33 +167,39 @@ public class EigencutsDriver extends Abs
    * @param tmp
    * @return
    */
-  public static DistributedRowMatrix performEigenDecomposition(
-      DistributedRowMatrix input, int numEigenVectors, int overshoot, 
-      List<Double> eigenValues, Matrix eigenVectors, Path tmp) 
-      throws IOException {
+  public static DistributedRowMatrix performEigenDecomposition(DistributedRowMatrix input,
+                                                               int numEigenVectors,
+                                                               int overshoot,
+                                                               List<Double> eigenValues,
+                                                               Matrix eigenVectors,
+                                                               Path tmp) throws IOException {
     DistributedLanczosSolver solver = new DistributedLanczosSolver();
     Path seqFiles = new Path(tmp, "eigendecomp-" + (System.nanoTime() & 0xFF));
-    solver.runJob(new Configuration(), input.getRowPath(), new Path(tmp, 
-        "lanczos-" + (System.nanoTime() & 0xFF)), input.numRows(), 
-        input.numCols(), true, overshoot, eigenVectors, eigenValues, 
-        seqFiles.toString());
-    
+    solver.runJob(new Configuration(),
+                  input.getRowPath(),
+                  new Path(tmp, "lanczos-" + (System.nanoTime() & 0xFF)),
+                  input.numRows(),
+                  input.numCols(),
+                  true,
+                  overshoot,
+                  eigenVectors,
+                  eigenValues,
+                  seqFiles.toString());
+
     // now run the verifier to trim down the number of eigenvectors
     EigenVerificationJob verifier = new EigenVerificationJob();
     Path verifiedEigens = new Path(tmp, "verifiedeigens");
-    verifier.runJob(seqFiles, input.getRowPath(), verifiedEigens, 
-        false, 1.0, 0.0, numEigenVectors);
+    verifier.runJob(seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, 0.0, numEigenVectors);
     Path cleanedEigens = verifier.getCleanedEigensPath();
-    return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, 
-        "tmp"), numEigenVectors, input.numRows());
+    return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
   }
-  
+
   /**
    * A quick and dirty hack to compute the median of a vector...
    * @param v
    * @return
    */
-  private double median(Vector v) {
+  private static double median(Vector v) {
     OnlineSummarizer med = new OnlineSummarizer();
     if (v.size() < 100) {
       return v.zSum() / v.size();
@@ -190,14 +209,14 @@ public class EigencutsDriver extends Abs
     }
     return med.getMedian();
   }
-  
+
   /**
    * Iteratively loops through the list, converting it to a Vector of double
    * primitives worthy of other Mahout operations
    * @param list
    * @return
    */
-  private Vector listToVector(List<Double> list) {
+  private static Vector listToVector(List<Double> list) {
     Vector retval = new DenseVector(list.size());
     int index = 0;
     for (Double d : list) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java?rev=1003329&r1=1003328&r2=1003329&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java Thu Sep 30 23:07:46 2010
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.clustering.spectral.kmeans;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -37,8 +38,11 @@ import org.apache.mahout.clustering.spec
 import org.apache.mahout.clustering.spectral.common.UnitVectorizerJob;
 import org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.Matrix;
 import org.apache.mahout.math.Vector;
@@ -62,24 +66,65 @@ public class SpectralKMeansDriver extend
   @Override
   public int run(String[] arg0) throws Exception {
     // set up command line options
-    Configuration conf = new Configuration();
-    addOption("input", "i", "Path to input affinity matrix data", true);
-    addOption("output", "o", "Output of clusterings", true);
+    Configuration conf = getConf();
+    addInputOption();
+    addOutputOption();
     addOption("dimensions", "d", "Square dimensions of affinity matrix", true);
     addOption("clusters", "k", "Number of clusters and top eigenvectors", true);
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
     Map<String, String> parsedArgs = parseArguments(arg0);
     if (parsedArgs == null) {
       return 0;
     }
 
-    // TODO: Need to be able to read all k-means parameters, though
-    // they will be optional parameters to the algorithm
-    // read the values of the command line
-    Path input = new Path(parsedArgs.get("--input"));
-    Path output = new Path(parsedArgs.get("--output"));
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.overwriteOutput(output);
+    }
     int numDims = Integer.parseInt(parsedArgs.get("--dimensions"));
     int clusters = Integer.parseInt(parsedArgs.get("--clusters"));
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+    DistanceMeasure measure = ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+
+    run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations);
 
+    return 0;
+  }
+
+  /**
+   * Run the Spectral KMeans clustering on the supplied arguments
+   * 
+   * @param conf the Configuration to be used
+   * @param input the Path to the input tuples directory
+   * @param output the Path to the output directory
+   * @param numDims the int number of dimensions of the affinity matrix
+   * @param clusters the int number of eigenvectors and thus clusters to produce
+   * @param measure the DistanceMeasure for the k-Means calculations
+   * @param convergenceDelta the double convergence delta for the k-Means calculations
+   * @param maxIterations the int maximum number of iterations for the k-Means calculations
+   * 
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws ClassNotFoundException
+   * @throws IllegalAccessException
+   * @throws InstantiationException
+   */
+  public static void run(Configuration conf,
+                         Path input,
+                         Path output,
+                         int numDims,
+                         int clusters,
+                         DistanceMeasure measure,
+                         double convergenceDelta,
+                         int maxIterations) throws IOException, InterruptedException, ClassNotFoundException,
+      IllegalAccessException, InstantiationException {
     // create a few new Paths for temp files and transformations
     Path outputCalc = new Path(output, "calculations");
     Path outputTmp = new Path(output, "temporary");
@@ -157,12 +202,11 @@ public class SpectralKMeansDriver extend
 
     // Finally, perform k-means clustering on the rows of L (or W)
     // generate random initial clusters
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
     Path initialclusters = RandomSeedGenerator.buildRandom(Wt.getRowPath(),
                                                            new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
                                                            clusters,
                                                            measure);
-    KMeansDriver.run(new Configuration(), Wt.getRowPath(), initialclusters, output, measure, 0.001, 10, true, false);
+    KMeansDriver.run(conf, Wt.getRowPath(), initialclusters, output, measure, convergenceDelta, maxIterations, true, false);
 
     // Read through the cluster assignments
     Path clusteredPointsPath = new Path(output, "clusteredPoints");
@@ -190,7 +234,5 @@ public class SpectralKMeansDriver extend
     reader.close();
 
     // TODO: output format???
-
-    return 0;
   }
 }