You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/01/15 17:43:19 UTC
svn commit: r1059370 - /mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java

Author: srowen
Date: Sat Jan 15 16:43:19 2011
New Revision: 1059370

URL: http://svn.apache.org/viewvc?rev=1059370&view=rev
Log:
MAHOUT-551

Modified:
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1059370&r1=1059369&r2=1059370&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sat Jan 15 16:43:19 2011
@@ -28,6 +28,7 @@ import org.apache.mahout.clustering.Clus
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.conversion.InputDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -55,7 +56,7 @@ public final class Job extends AbstractJ
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      new Job().run(new Configuration(), new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
+      new Job().run(new Configuration(), new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
     }
   }
 
@@ -65,6 +66,7 @@ public final class Job extends AbstractJ
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.numClustersOption().create());
     addOption(DefaultOptionCreator.t1Option().create());
     addOption(DefaultOptionCreator.t2Option().create());
     addOption(DefaultOptionCreator.convergenceOption().create());
@@ -90,11 +92,66 @@ public final class Job extends AbstractJ
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     Class<?> cl = ccl.loadClass(measureClass);
     DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
-    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
-    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
+    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+      int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+      run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
+    } else {
+      double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+      double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+      run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
+    }
     return 0;
   }
+  
+  /**
+   * Run the kmeans clustering job on an input dataset using the given number of clusters k and iteration
+   * parameters. All output data will be written to the output directory, which will be initially deleted if
+   * it exists. The clustered points will reside in the path <output>/clusteredPoints. By default, the job
+   * expects a file containing equal length space delimited data that resides in a directory named
+   * "testdata", and writes output to a directory named "output".
+   * @param conf the Configuration to use
+   * @param input
+   *          the Path denoting the input directory
+   * @param output
+   *          the Path denoting the output directory
+   * @param measure
+   *          the DistanceMeasure to use
+   * @param k
+   *          the number of initial clusters requested from the random seed generator
+   * @param convergenceDelta
+   *          the double convergence criteria for iterations
+   * @param maxIterations
+   *          the int maximum number of iterations
+   */
+  public void run(Configuration conf,
+                  Path input,
+                  Path output,
+                  DistanceMeasure measure,
+                  int k,
+                  double convergenceDelta,
+                  int maxIterations)
+    throws IOException, InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
+    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+    log.info("Preparing Input");
+    InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+    log.info("Running random seed to get initial clusters");
+    Path clusters= new Path(output, Cluster.INITIAL_CLUSTERS_DIR); // reassigned below to the path buildRandom returns
+    clusters = RandomSeedGenerator.buildRandom(directoryContainingConvertedInput, clusters, k, measure);
+    log.info("Running KMeans");
+    KMeansDriver.run(conf,
+                     directoryContainingConvertedInput,
+                     clusters,
+                     output,
+                     measure,
+                     convergenceDelta,
+                     maxIterations,
+                     true, // NOTE(review): presumably the "run clustering" flag - confirm against KMeansDriver.run
+                     false); // NOTE(review): presumably the "run sequential" flag - confirm against KMeansDriver.run
+    // run ClusterDumper over the final cluster path and the <output>/clusteredPoints directory
+    ClusterDumper clusterDumper =
+        new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints"));
+    clusterDumper.printClusters(null);
+  }
 
   /**
    * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration