You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/10/11 03:03:06 UTC

svn commit: r1006367 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/ core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/ core/src/main/java/org/apache/mahout/math/hadoop/decomposer/ examples/src/main...

Author: jeastman
Date: Mon Oct 11 01:03:06 2010
New Revision: 1006367

URL: http://svn.apache.org/viewvc?rev=1006367&view=rev
Log:
- Added DisplaySpectralKMeans example
- Made Configuration and JobConf handling explicit in EigenVerificationJob in order to make the example work
- Updated SpectralKMeansDriver, EigencutsDriver to supply configs
- Updated example README
All tests run. DisplaySpectralKMeans example gets pretty far through before bombing on W.transpose() after eigen cleanup. Committing for more eyeballs as is.

Added:
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/README.txt

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java?rev=1006367&r1=1006366&r2=1006367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java Mon Oct 11 01:03:06 2010
@@ -129,7 +129,7 @@ public class EigencutsDriver extends Abs
       int overshoot = (int) ((double) dimensions * OVERSHOOT_MULTIPLIER);
       List<Double> eigenValues = new ArrayList<Double>(overshoot);
       Matrix eigenVectors = new DenseMatrix(overshoot, dimensions);
-      DistributedRowMatrix U = performEigenDecomposition(L, dimensions, overshoot, eigenValues, eigenVectors, outputCalc);
+      DistributedRowMatrix U = performEigenDecomposition(conf, L, dimensions, overshoot, eigenValues, eigenVectors, outputCalc);
       U.configure(new JobConf(conf));
       eigenValues = eigenValues.subList(0, dimensions);
 
@@ -164,15 +164,15 @@ public class EigencutsDriver extends Abs
    * values, and generally performing the tedious administrative tasks involved
    * in an eigen-decomposition and running the verifier
    */
-  public static DistributedRowMatrix performEigenDecomposition(DistributedRowMatrix input,
+  public static DistributedRowMatrix performEigenDecomposition(Configuration conf,
+                                                               DistributedRowMatrix input,
                                                                int numEigenVectors,
                                                                int overshoot,
                                                                List<Double> eigenValues,
-                                                               Matrix eigenVectors,
-                                                               Path tmp) throws IOException {
+                                                               Matrix eigenVectors, Path tmp) throws IOException {
     DistributedLanczosSolver solver = new DistributedLanczosSolver();
     Path seqFiles = new Path(tmp, "eigendecomp-" + (System.nanoTime() & 0xFF));
-    solver.runJob(new Configuration(),
+    solver.runJob(conf,
                   input.getRowPath(),
                   new Path(tmp, "lanczos-" + (System.nanoTime() & 0xFF)),
                   input.numRows(),
@@ -186,7 +186,7 @@ public class EigencutsDriver extends Abs
     // now run the verifier to trim down the number of eigenvectors
     EigenVerificationJob verifier = new EigenVerificationJob();
     Path verifiedEigens = new Path(tmp, "verifiedeigens");
-    verifier.runJob(seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, 0.0, numEigenVectors);
+    verifier.runJob(conf, seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, 0.0, numEigenVectors);
     Path cleanedEigens = verifier.getCleanedEigensPath();
     return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java?rev=1006367&r1=1006366&r2=1006367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java Mon Oct 11 01:03:06 2010
@@ -146,7 +146,7 @@ public class SpectralKMeansDriver extend
     DistributedRowMatrix L =
         VectorMatrixMultiplicationJob.runJob(affSeqFiles, D,
             new Path(outputCalc, "laplacian-" + (System.nanoTime() & 0xFF)));
-    L.configure(new JobConf(conf));
+    L.configure(depConf);
 
     // Next step: perform eigen-decomposition using LanczosSolver
     // since some of the eigen-output is spurious and will be eliminated
@@ -171,10 +171,10 @@ public class SpectralKMeansDriver extend
     // perform a verification
     EigenVerificationJob verifier = new EigenVerificationJob();
     Path verifiedEigensPath = new Path(outputCalc, "eigenverifier");
-    verifier.runJob(lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, 0.0, clusters);
+    verifier.runJob(conf, lanczosSeqFiles, L.getRowPath(), verifiedEigensPath, true, 1.0, 0.0, clusters);
     Path cleanedEigens = verifier.getCleanedEigensPath();
     DistributedRowMatrix W = new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), clusters, numDims);
-    W.configure(new JobConf());
+    W.configure(depConf);
     DistributedRowMatrix Wtrans = W.transpose();
     //    DistributedRowMatrix Wt = W.transpose();
 
@@ -182,7 +182,7 @@ public class SpectralKMeansDriver extend
     Path unitVectors = new Path(outputCalc, "unitvectors-" + (System.nanoTime() & 0xFF));
     UnitVectorizerJob.runJob(Wtrans.getRowPath(), unitVectors);
     DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
-    Wt.configure(new JobConf());
+    Wt.configure(depConf);
 
     // Finally, perform k-means clustering on the rows of L (or W)
     // generate random initial clusters

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java?rev=1006367&r1=1006366&r2=1006367&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java Mon Oct 11 01:03:06 2010
@@ -88,8 +88,6 @@ public class EigenVerificationJob extend
 
   private Path outPath;
 
-  private JobConf conf;
-
   private int maxEigensToKeep;
 
   private Path cleanedEigensPath;
@@ -107,13 +105,13 @@ public class EigenVerificationJob extend
       return 0;
     }
     // parse out the arguments
-    runJob(new Path(argMap.get("--eigenInput")),
+    runJob(getConf(),
+           new Path(argMap.get("--eigenInput")),
            new Path(argMap.get("--corpusInput")),
            getOutputPath(),
            argMap.get("--inMemory") != null,
            Double.parseDouble(argMap.get("--maxError")),
-           Double.parseDouble(argMap.get("--minEigenvalue")),
-           Integer.parseInt(argMap.get("--maxEigens")));
+           Double.parseDouble(argMap.get("--minEigenvalue")), Integer.parseInt(argMap.get("--maxEigens")));
     return 0;
   }
 
@@ -141,13 +139,12 @@ public class EigenVerificationJob extend
     this.tmpOut = tempOut;
     this.maxError = maxError;
     this.minEigenValue = minEigenValue;
-    this.conf = config != null ? config : new JobConf();
 
     if (eigenInput != null && eigensToVerify == null) {
-      prepareEigens(eigenInput, inMemory);
+      prepareEigens(config, eigenInput, inMemory);
     }
     DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tempOut, 1, 1);
-    c.configure(conf);
+    c.configure(config);
     corpus = c;
 
     // set up eigenverifier and orthoverifier TODO: allow multithreaded execution
@@ -161,7 +158,7 @@ public class EigenVerificationJob extend
 
     List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);
 
-    saveCleanEigens(prunedEigenMeta);
+    saveCleanEigens(new Configuration(), prunedEigenMeta);
     return 0;
   }
 
@@ -185,7 +182,7 @@ public class EigenVerificationJob extend
     return OrthonormalityVerifier.pairwiseInnerProducts(eigensToVerify);
   }
 
-  private void saveCleanEigens(List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta) throws IOException {
+  private void saveCleanEigens(Configuration conf, List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta) throws IOException {
     Path path = new Path(outPath, CLEAN_EIGENVECTORS);
     FileSystem fs = FileSystem.get(conf);
     SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
@@ -241,7 +238,7 @@ public class EigenVerificationJob extend
     return eigenMetaData;
   }
 
-  private void prepareEigens(Path eigenInput, boolean inMemory) {
+  private void prepareEigens(JobConf conf, Path eigenInput, boolean inMemory) {
     DistributedRowMatrix eigens = new DistributedRowMatrix(eigenInput, tmpOut, 1, 1);
     eigens.configure(conf);
     if (inMemory) {
@@ -267,6 +264,7 @@ public class EigenVerificationJob extend
 
   /**
    * Progammatic invocation of run()
+   * @param conf TODO
    * @param eigenInput Output of LanczosSolver
    * @param corpusInput Input of LanczosSolver
    * @param output
@@ -275,27 +273,24 @@ public class EigenVerificationJob extend
    * @param minEigenValue
    * @param maxEigens
    */
-  public void runJob(Path eigenInput,
+  public void runJob(Configuration conf,
+                     Path eigenInput,
                      Path corpusInput,
                      Path output,
                      boolean inMemory,
                      double maxError,
-                     double minEigenValue,
-                     int maxEigens) throws IOException {
+                     double minEigenValue, int maxEigens) throws IOException {
     // no need to handle command line arguments
     outPath = output;
     tmpOut = new Path(outPath, "tmp");
     maxEigensToKeep = maxEigens;
     this.maxError = maxError;
-    if (getConf() == null) {
-      setConf(new Configuration());
-    }
     if (eigenInput != null && eigensToVerify == null) {
-      prepareEigens(eigenInput, inMemory);
+      prepareEigens(new JobConf(conf), eigenInput, inMemory);
     }
 
     DistributedRowMatrix c = new DistributedRowMatrix(corpusInput, tmpOut, 1, 1);
-    c.configure(new JobConf(getConf()));
+    c.configure(new JobConf(conf));
     corpus = c;
 
     eigenVerifier = new SimpleEigenVerifier();
@@ -305,6 +300,6 @@ public class EigenVerificationJob extend
 
     Map<MatrixSlice, EigenStatus> eigenMetaData = verifyEigens();
     List<Map.Entry<MatrixSlice, EigenStatus>> prunedEigenMeta = pruneEigens(eigenMetaData);
-    saveCleanEigens(prunedEigenMeta);
+    saveCleanEigens(conf, prunedEigenMeta);
   }
 }

Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java?rev=1006367&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java Mon Oct 11 01:03:06 2010
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+
+class DisplaySpectralKMeans extends DisplayClustering {
+
+  DisplaySpectralKMeans() {
+    initialize();
+    this.setTitle("Spectral k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+  }
+
+  public static void main(String[] args) throws Exception {
+    DistanceMeasure measure = new ManhattanDistanceMeasure();
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    HadoopUtil.overwriteOutput(samples);
+    HadoopUtil.overwriteOutput(output);
+
+    RandomUtils.useTestSeed();
+    DisplayClustering.generateSamples();
+    writeSampleData(samples);
+    int maxIter = 10;
+    double convergenceDelta = 0.001;
+    Path affinities = new Path(output, "affinities");
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    if (!fs.exists(output)) {
+      fs.mkdirs(output);
+    }
+    FileWriter writer = new FileWriter(affinities.toString());
+    PrintWriter out = new PrintWriter(writer);
+    try {
+      for (int i = 0; i < SAMPLE_DATA.size(); i++) {
+        for (int j = 0; j < SAMPLE_DATA.size(); j++) {
+          out.println(i + "," + j + "," + measure.distance(SAMPLE_DATA.get(i).get(), SAMPLE_DATA.get(j).get()));
+        }
+      }
+    } finally {
+      out.close();
+    }
+    SpectralKMeansDriver.run(new Configuration(), affinities, output, 1100, 5, measure, convergenceDelta, maxIter);
+    loadClusters(output);
+    new DisplaySpectralKMeans();
+  }
+
+  // Override the paint() method
+  @Override
+  public void paint(Graphics g) {
+    plotSampleData((Graphics2D) g);
+    plotClusters((Graphics2D) g);
+  }
+}

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/README.txt?rev=1006367&r1=1006366&r2=1006367&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/README.txt (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/README.txt Mon Oct 11 01:03:06 2010
@@ -12,9 +12,9 @@ DisplayClustering - generates 1000 sampl
   * DisplayFuzzyKMeans - uses Fuzzy k-Means clustering
   * DisplayMeanShift - uses MeanShift clustering
   
-  * NOTE: some of these programs display the sample points and then superimposes all of the clusters
+  * NOTE: some of these programs display the sample points and then superimpose all of the clusters
     from each iteration. The last iteration's clusters are in bold red and the previous several are 
-    colored (orange, yellow, green, blue, magenta) in order after which all earlier clusters are in
+    colored (orange, yellow, green, blue, violet) in order after which all earlier clusters are in
     light grey. This helps to visualize how the clusters converge upon a solution over multiple
     iterations.
   * NOTE: by changing the parameter values (k, ALPHA_0, numIterations) and the display SIGNIFICANCE