You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/01/15 17:43:19 UTC
svn commit: r1059370 -
/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
Author: srowen
Date: Sat Jan 15 16:43:19 2011
New Revision: 1059370
URL: http://svn.apache.org/viewvc?rev=1059370&view=rev
Log:
MAHOUT-551
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1059370&r1=1059369&r2=1059370&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sat Jan 15 16:43:19 2011
@@ -28,6 +28,7 @@ import org.apache.mahout.clustering.Clus
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.conversion.InputDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -55,7 +56,7 @@ public final class Job extends AbstractJ
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- new Job().run(new Configuration(), new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
+ new Job().run(new Configuration(), new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
}
}
@@ -65,6 +66,7 @@ public final class Job extends AbstractJ
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.numClustersOption().create());
addOption(DefaultOptionCreator.t1Option().create());
addOption(DefaultOptionCreator.t2Option().create());
addOption(DefaultOptionCreator.convergenceOption().create());
@@ -90,11 +92,66 @@ public final class Job extends AbstractJ
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
Class<?> cl = ccl.loadClass(measureClass);
DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
- double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
- double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
+ if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+ int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+ run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
+ } else {
+ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+ double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
+ }
return 0;
}
+
+ /**
+ * Run the kmeans clustering job on an input dataset using the given number of clusters k and iteration
+ * parameters. All output data will be written to the output directory, which will be initially deleted if
+ * it exists. The clustered points will reside in the path <output>/clusteredPoints. By default, the job
+ * expects a file containing equal length space delimited data that resides in a directory named
+ * "testdata", and writes output to a directory named "output".
+ * @param conf the Configuration to use
+ * @param input
+ * the Path denoting the input directory
+ * @param output
+ * the Path denoting the output directory
+ * @param measure
+ * the DistanceMeasure to use
+ * @param k
+ * the number of initial clusters, seeded via RandomSeedGenerator.buildRandom
+ * @param convergenceDelta
+ * the double convergence criterion for iterations
+ * @param maxIterations
+ * the int maximum number of iterations
+ */
+ public void run(Configuration conf,
+ Path input,
+ Path output,
+ DistanceMeasure measure,
+ int k,
+ double convergenceDelta,
+ int maxIterations)
+ throws IOException, InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
+ Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+ log.info("Preparing Input");
+ InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+ log.info("Running random seed to get initial clusters");
+ Path clusters= new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
+ clusters = RandomSeedGenerator.buildRandom(directoryContainingConvertedInput, clusters, k, measure);
+ log.info("Running KMeans");
+ KMeansDriver.run(conf,
+ directoryContainingConvertedInput,
+ clusters,
+ output,
+ measure,
+ convergenceDelta,
+ maxIterations,
+ true,
+ false);
+ // run ClusterDumper on the final cluster path and the points under <output>/clusteredPoints
+ ClusterDumper clusterDumper =
+ new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints"));
+ clusterDumper.printClusters(null);
+ }
/**
* Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration