You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/10/01 01:07:47 UTC
svn commit: r1003329 - in
/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral:
eigencuts/EigencutsDriver.java kmeans/SpectralKMeansDriver.java
Author: jeastman
Date: Thu Sep 30 23:07:46 2010
New Revision: 1003329
URL: http://svn.apache.org/viewvc?rev=1003329&view=rev
Log:
MAHOUT-519:
- SpectralKMeansDriver: added distanceMeasureOption, convergenceOption, maxIterationsOption and overwriteOption
- SpectralKMeansDriver: factored out a static run() method that takes the required Java parameters, for programmatic invocation
- EigencutsDriver: added overwriteOption
- EigencutsDriver: factored out a static run() method that takes the required Java parameters, for programmatic invocation
No tests currently exercise the CLI for these drivers, so the changes were tested manually.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java?rev=1003329&r1=1003328&r2=1003329&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java Thu Sep 30 23:07:46 2010
@@ -30,6 +30,8 @@ import org.apache.mahout.clustering.spec
import org.apache.mahout.clustering.spectral.common.MatrixDiagonalizeJob;
import org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
@@ -40,20 +42,12 @@ import org.apache.mahout.math.hadoop.dec
import org.apache.mahout.math.stats.OnlineSummarizer;
public class EigencutsDriver extends AbstractJob {
-
+
public static final double EPSILON_DEFAULT = 0.25;
+
public static final double TAU_DEFAULT = -0.1;
+
public static final double OVERSHOOT_MULTIPLIER = 1.5;
-
- // parameters of interest
- /** number of dimensions in the square affinity matrix */
- private int dimensions;
- /** user-supplied minimum half-life threshold */
- private double halflife;
- /** user-supplied coefficient for setting minimum half-life threshold */
- private double epsilon;
- /** user-supplied threshold for cutting links in the affinity graph */
- private double tau;
public static void main(String args[]) throws Exception {
ToolRunner.run(new EigencutsDriver(), args);
@@ -61,7 +55,7 @@ public class EigencutsDriver extends Abs
@Override
public int run(String[] arg0) throws Exception {
-
+
// set up command line arguments
Configuration conf = new Configuration();
addOption("input", "i", "Path to input affinity matrix data", true);
@@ -70,26 +64,52 @@ public class EigencutsDriver extends Abs
addOption("dimensions", "d", "Square dimensions of affinity matrix", true);
addOption("epsilon", "e", "Half-life threshold coefficient", Double.toString(EigencutsDriver.EPSILON_DEFAULT));
addOption("tau", "t", "Threshold for cutting affinities", Double.toString(EigencutsDriver.TAU_DEFAULT));
+ addOption(DefaultOptionCreator.overwriteOption().create());
Map<String, String> parsedArgs = parseArguments(arg0);
if (parsedArgs == null) {
return 0;
}
-
+
// read in the command line values
Path input = new Path(parsedArgs.get("--input"));
Path output = new Path(parsedArgs.get("--output"));
- dimensions = Integer.parseInt(parsedArgs.get("--dimensions"));
- halflife = Integer.parseInt(parsedArgs.get("--half-life"));
- epsilon = Double.parseDouble(parsedArgs.get("--epsilon"));
- tau = Double.parseDouble(parsedArgs.get("--tau"));
-
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.overwriteOutput(output);
+ }
+ int dimensions = Integer.parseInt(parsedArgs.get("--dimensions"));
+ double halflife = Double.parseDouble(parsedArgs.get("--half-life"));
+ double epsilon = Double.parseDouble(parsedArgs.get("--epsilon"));
+ double tau = Double.parseDouble(parsedArgs.get("--tau"));
+
+ run(conf, input, output, dimensions, halflife, epsilon, tau);
+
+ return 0;
+ }
+
+ /**
+ * Run the Eigencuts clustering algorithm using the supplied arguments
+ *
+ * @param conf the Configuration to use
+ * @param input the Path to the directory containing input affinity tuples
+ * @param output the Path to the output directory
+ * @param dimensions the int number of dimensions of the square affinity matrix
+ * @param halflife the double minimum half-life threshold
+ * @param epsilon the double coefficient for setting minimum half-life threshold
+ * @param tau the double tau threshold for cutting links in the affinity graph
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ */
+ public static void run(Configuration conf, Path input, Path output, int dimensions, double halflife, double epsilon, double tau)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ // set the instance variables
// create a few new Paths for temp files and transformations
Path outputCalc = new Path(output, "calculations");
Path outputTmp = new Path(output, "temporary");
-
+
DistributedRowMatrix A = AffinityMatrixInputJob.runJob(input, outputCalc, dimensions);
Vector D = MatrixDiagonalizeJob.runJob(A.getRowPath(), dimensions);
-
+
long numCuts = 0;
do {
// first three steps are the same as spectral k-means:
@@ -97,51 +117,44 @@ public class EigencutsDriver extends Abs
// 2) calculate L = D^-0.5 * A * D^-0.5
// 3) calculate eigenvectors of L
- DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(A.getRowPath(),
- D, new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
+ DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(A.getRowPath(), D, new Path(outputCalc, "laplacian-"
+ + (System.nanoTime() & 0xFF)));
L.configure(new JobConf(conf));
-
+
// eigendecomposition (step 3)
- int overshoot = (int)((double)dimensions *
- EigencutsDriver.OVERSHOOT_MULTIPLIER);
+ int overshoot = (int) ((double) dimensions * EigencutsDriver.OVERSHOOT_MULTIPLIER);
List<Double> eigenValues = new ArrayList<Double>(overshoot);
Matrix eigenVectors = new DenseMatrix(overshoot, dimensions);
- DistributedRowMatrix U = performEigenDecomposition(L,
- dimensions, overshoot, eigenValues, eigenVectors,
- outputCalc);
+ DistributedRowMatrix U = performEigenDecomposition(L, dimensions, overshoot, eigenValues, eigenVectors, outputCalc);
U.configure(new JobConf(conf));
eigenValues = eigenValues.subList(0, dimensions);
-
+
// here's where things get interesting: steps 4, 5, and 6 are unique
// to this algorithm, and depending on the final output, steps 1-3
// may be repeated as well
-
+
// helper method, since apparently List and Vector objects don't play nicely
Vector evs = listToVector(eigenValues);
-
+
// calculate sensitivities (step 4 and step 5)
Path sensitivities = new Path(outputCalc, "sensitivities-" + (System.nanoTime() & 0xFF));
- EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife,
- tau, median(D), epsilon, sensitivities);
-
+ EigencutsSensitivityJob.runJob(evs, D, U.getRowPath(), halflife, tau, median(D), epsilon, sensitivities);
+
// perform the cuts (step 6)
input = new Path(outputTmp, "nextAff-" + (System.nanoTime() & 0xFF));
numCuts = EigencutsAffinityCutsJob.runjob(A.getRowPath(), sensitivities, input, conf);
-
+
// how many cuts were made?
if (numCuts > 0) {
// recalculate A
- A = new DistributedRowMatrix(input, new Path(outputTmp, Long.toString(System.nanoTime())),
- dimensions, dimensions);
+ A = new DistributedRowMatrix(input, new Path(outputTmp, Long.toString(System.nanoTime())), dimensions, dimensions);
A.configure(new JobConf());
}
} while (numCuts > 0);
-
- // TODO: output format???
-
- return 0;
+
+ // TODO: MAHOUT-517: Eigencuts needs an output format
}
-
+
/**
* Does most of the heavy lifting in setting up Paths, configuring return
* values, and generally performing the tedious administrative tasks involved
@@ -154,33 +167,39 @@ public class EigencutsDriver extends Abs
* @param tmp
* @return
*/
- public static DistributedRowMatrix performEigenDecomposition(
- DistributedRowMatrix input, int numEigenVectors, int overshoot,
- List<Double> eigenValues, Matrix eigenVectors, Path tmp)
- throws IOException {
+ public static DistributedRowMatrix performEigenDecomposition(DistributedRowMatrix input,
+ int numEigenVectors,
+ int overshoot,
+ List<Double> eigenValues,
+ Matrix eigenVectors,
+ Path tmp) throws IOException {
DistributedLanczosSolver solver = new DistributedLanczosSolver();
Path seqFiles = new Path(tmp, "eigendecomp-" + (System.nanoTime() & 0xFF));
- solver.runJob(new Configuration(), input.getRowPath(), new Path(tmp,
- "lanczos-" + (System.nanoTime() & 0xFF)), input.numRows(),
- input.numCols(), true, overshoot, eigenVectors, eigenValues,
- seqFiles.toString());
-
+ solver.runJob(new Configuration(),
+ input.getRowPath(),
+ new Path(tmp, "lanczos-" + (System.nanoTime() & 0xFF)),
+ input.numRows(),
+ input.numCols(),
+ true,
+ overshoot,
+ eigenVectors,
+ eigenValues,
+ seqFiles.toString());
+
// now run the verifier to trim down the number of eigenvectors
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigens = new Path(tmp, "verifiedeigens");
- verifier.runJob(seqFiles, input.getRowPath(), verifiedEigens,
- false, 1.0, 0.0, numEigenVectors);
+ verifier.runJob(seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, 0.0, numEigenVectors);
Path cleanedEigens = verifier.getCleanedEigensPath();
- return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens,
- "tmp"), numEigenVectors, input.numRows());
+ return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
}
-
+
/**
* A quick and dirty hack to compute the median of a vector...
* @param v
* @return
*/
- private double median(Vector v) {
+ private static double median(Vector v) {
OnlineSummarizer med = new OnlineSummarizer();
if (v.size() < 100) {
return v.zSum() / v.size();
@@ -190,14 +209,14 @@ public class EigencutsDriver extends Abs
}
return med.getMedian();
}
-
+
/**
* Iteratively loops through the list, converting it to a Vector of double
* primitives worthy of other Mahout operations
* @param list
* @return
*/
- private Vector listToVector(List<Double> list) {
+ private static Vector listToVector(List<Double> list) {
Vector retval = new DenseVector(list.size());
int index = 0;
for (Double d : list) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java?rev=1003329&r1=1003328&r2=1003329&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java Thu Sep 30 23:07:46 2010
@@ -17,6 +17,7 @@
package org.apache.mahout.clustering.spectral.kmeans;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -37,8 +38,11 @@ import org.apache.mahout.clustering.spec
import org.apache.mahout.clustering.spectral.common.UnitVectorizerJob;
import org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;
@@ -62,24 +66,65 @@ public class SpectralKMeansDriver extend
@Override
public int run(String[] arg0) throws Exception {
// set up command line options
- Configuration conf = new Configuration();
- addOption("input", "i", "Path to input affinity matrix data", true);
- addOption("output", "o", "Output of clusterings", true);
+ Configuration conf = getConf();
+ addInputOption();
+ addOutputOption();
addOption("dimensions", "d", "Square dimensions of affinity matrix", true);
addOption("clusters", "k", "Number of clusters and top eigenvectors", true);
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.convergenceOption().create());
+ addOption(DefaultOptionCreator.maxIterationsOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
Map<String, String> parsedArgs = parseArguments(arg0);
if (parsedArgs == null) {
return 0;
}
- // TODO: Need to be able to read all k-means parameters, though
- // they will be optional parameters to the algorithm
- // read the values of the command line
- Path input = new Path(parsedArgs.get("--input"));
- Path output = new Path(parsedArgs.get("--output"));
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.overwriteOutput(output);
+ }
int numDims = Integer.parseInt(parsedArgs.get("--dimensions"));
int clusters = Integer.parseInt(parsedArgs.get("--clusters"));
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ DistanceMeasure measure = ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
+ double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+ int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+
+ run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations);
+ return 0;
+ }
+
+ /**
+ * Run the Spectral KMeans clustering on the supplied arguments
+ *
+ * @param conf the Configuration to be used
+ * @param input the Path to the input tuples directory
+ * @param output the Path to the output directory
+ * @param numDims the int number of dimensions of the affinity matrix
+ * @param clusters the int number of eigenvectors and thus clusters to produce
+ * @param measure the DistanceMeasure for the k-Means calculations
+ * @param convergenceDelta the double convergence delta for the k-Means calculations
+ * @param maxIterations the int maximum number of iterations for the k-Means calculations
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ * @throws IllegalAccessException
+ * @throws InstantiationException
+ */
+ public static void run(Configuration conf,
+ Path input,
+ Path output,
+ int numDims,
+ int clusters,
+ DistanceMeasure measure,
+ double convergenceDelta,
+ int maxIterations) throws IOException, InterruptedException, ClassNotFoundException,
+ IllegalAccessException, InstantiationException {
// create a few new Paths for temp files and transformations
Path outputCalc = new Path(output, "calculations");
Path outputTmp = new Path(output, "temporary");
@@ -157,12 +202,11 @@ public class SpectralKMeansDriver extend
// Finally, perform k-means clustering on the rows of L (or W)
// generate random initial clusters
- DistanceMeasure measure = new EuclideanDistanceMeasure();
Path initialclusters = RandomSeedGenerator.buildRandom(Wt.getRowPath(),
new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
clusters,
measure);
- KMeansDriver.run(new Configuration(), Wt.getRowPath(), initialclusters, output, measure, 0.001, 10, true, false);
+ KMeansDriver.run(conf, Wt.getRowPath(), initialclusters, output, measure, convergenceDelta, maxIterations, true, false);
// Read through the cluster assignments
Path clusteredPointsPath = new Path(output, "clusteredPoints");
@@ -190,7 +234,5 @@ public class SpectralKMeansDriver extend
reader.close();
// TODO: output format???
-
- return 0;
}
}