You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/05/01 00:52:12 UTC
svn commit: r939867 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clustering/kmeans/
core/src/main/java/org/apache/mahout/clustering/meanshift/
core/src/test/java/org/apache/m...
Author: jeastman
Date: Fri Apr 30 22:52:11 2010
New Revision: 939867
URL: http://svn.apache.org/viewvc?rev=939867&view=rev
Log:
MAHOUT-236:
- removed output directory deletion from DirichletDriver.writeInitialState
- added runClustering option to KmeansDriver and MeanShiftCanopyDriver
- refactored methods from MeanShiftCanopyJob into MeanShiftCanopyDriver and removed job
- adjusted TestKmeansClustering and TestMeanShift
- adjusted synthetic control examples to employ ClusterDumper of outputs
- adjusted TestClusterDumper and TestCDbwEvaluator for KMeans and MeanShift job api changes
- decreased number of iterations in unit tests to improve performance
- all tests run
Removed:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Fri Apr 30 22:52:11 2010
@@ -247,10 +247,8 @@ public class DirichletDriver {
IllegalAccessException, IOException, SecurityException, NoSuchMethodException, InvocationTargetException {
DirichletState<VectorWritable> state = createState(modelFactory, modelPrototype, prototypeSize, numModels, alpha_0);
- JobConf job = new JobConf(KMeansDriver.class);
- Path outPath = new Path(output);
- FileSystem fs = FileSystem.get(outPath.toUri(), job);
- fs.delete(outPath, true);
+ JobConf job = new JobConf(DirichletDriver.class);
+ FileSystem fs = FileSystem.get(new Path(output).toUri(), job);
for (int i = 0; i < numModels; i++) {
Path path = new Path(stateIn + "/part-" + i);
SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, Text.class, DirichletCluster.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Fri Apr 30 22:52:11 2010
@@ -49,86 +49,78 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class KMeansDriver {
-
+
private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
-
- private KMeansDriver() {}
-
+
+ private KMeansDriver() {
+ }
+
/**
* @param args
* Expects 7 args and they all correspond to the order of the params in {@link #runJob}
*/
public static void main(String[] args) throws Exception {
-
+
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
-
- Option clustersOpt = obuilder
- .withLongName("clusters")
- .withRequired(true)
- .withArgument(abuilder.withName("clusters").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The input centroids, as Vectors. "
- + "Must be a SequenceFile of Writable, Cluster/Canopy. "
- + "If k is also specified, then a random set of vectors will be selected "
- + "and written out to this path first")
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+
+ Option clustersOpt = obuilder.withLongName("clusters").withRequired(true).withArgument(
+ abuilder.withName("clusters").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The input centroids, as Vectors. " + "Must be a SequenceFile of Writable, Cluster/Canopy. "
+ + "If k is also specified, then a random set of vectors will be selected " + "and written out to this path first")
.withShortName("c").create();
-
- Option kOpt = obuilder
- .withLongName("k")
- .withRequired(false)
- .withArgument(abuilder.withName("k").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "The k in k-Means. If specified, then a random selection of k Vectors will be chosen "
- + "as the Centroid and written to the clusters output path.")
- .withShortName("k").create();
-
- Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Path to put the output in").withShortName("o").create();
-
+
+ Option kOpt = obuilder.withLongName("k").withRequired(false).withArgument(
+ abuilder.withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The k in k-Means. If specified, then a random selection of k Vectors will be chosen "
+ + "as the Centroid and written to the clusters output path.").withShortName("k").create();
+
+ Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The Path to put the output in")
+ .withShortName("o").create();
+
Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
- "If set, overwrite the output directory").withShortName("w").create();
-
+ "If set, overwrite the output directory").withShortName("w").create();
+
Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
- abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
-
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+
Option convergenceDeltaOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(
- abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).withDescription(
- "The threshold below which the clusters are considered to be converged. Default is 0.5")
- .withShortName("d").create();
-
+ abuilder.withName("convergence").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The threshold below which the clusters are considered to be converged. Default is 0.5").withShortName("d").create();
+
Option maxIterationsOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
- "The maximum number of iterations to perform. Default is 20").withShortName("x").create();
-
+ abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The maximum number of iterations to perform. Default is 20").withShortName("x").create();
+
Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
- abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v")
- .create();
-
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v").create();
+
Option numReduceTasksOpt = obuilder.withLongName("numReduce").withRequired(false).withArgument(
- abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).withDescription(
- "The number of reduce tasks").withShortName("r").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ abuilder.withName("numReduce").withMinimum(1).withMaximum(1).create()).withDescription("The number of reduce tasks")
+ .withShortName("r").create();
+
+ Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
+ "If true, run clustering after the iterations have taken place").withShortName("cl").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(outputOpt).withOption(
+ measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(numReduceTasksOpt)
+ .withOption(kOpt).withOption(vectorClassOpt).withOption(overwriteOutput).withOption(helpOpt).withOption(clusteringOpt)
.create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(clustersOpt).withOption(
- outputOpt).withOption(measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt)
- .withOption(numReduceTasksOpt).withOption(kOpt).withOption(vectorClassOpt)
- .withOption(overwriteOutput).withOption(helpOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
@@ -144,11 +136,11 @@ public final class KMeansDriver {
if (cmdLine.hasOption(convergenceDeltaOpt)) {
convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
}
-
+
// Class<? extends Vector> vectorClass = cmdLine.hasOption(vectorClassOpt) == false ?
// RandomAccessSparseVector.class
// : (Class<? extends Vector>) Class.forName(cmdLine.getValue(vectorClassOpt).toString());
-
+
int maxIterations = 20;
if (cmdLine.hasOption(maxIterationsOpt)) {
maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt).toString());
@@ -161,16 +153,19 @@ public final class KMeansDriver {
HadoopUtil.overwriteOutput(output);
}
if (cmdLine.hasOption(kOpt)) {
- clusters = RandomSeedGenerator.buildRandom(input, clusters,
- Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
+ clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(cmdLine.getValue(kOpt).toString())).toString();
}
- runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks);
+ boolean runClustering = true;
+ if (cmdLine.hasOption(clusteringOpt)) {
+ runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
+ }
+ runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks, runClustering);
} catch (OptionException e) {
log.error("Exception", e);
CommandLineUtil.printHelp(group);
}
}
-
+
/**
* Run the job using supplied arguments
*
@@ -188,21 +183,17 @@ public final class KMeansDriver {
* the maximum number of iterations
* @param numReduceTasks
* the number of reducers
+ * @param runClustering
+ * true if points are to be clustered after iterations are completed
*/
- public static void runJob(String input,
- String clustersIn,
- String output,
- String measureClass,
- double convergenceDelta,
- int maxIterations,
- int numReduceTasks) {
+ public static void runJob(String input, String clustersIn, String output, String measureClass, double convergenceDelta,
+ int maxIterations, int numReduceTasks, boolean runClustering) {
// iterate until the clusters converge
String delta = Double.toString(convergenceDelta);
if (log.isInfoEnabled()) {
- log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output,
- measureClass});
- log.info("convergence: {} max Iterations: {} num Reduce Tasks: {} Input Vectors: {}",
- new Object[] {convergenceDelta, maxIterations, numReduceTasks, VectorWritable.class.getName()});
+ log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] { input, clustersIn, output, measureClass });
+ log.info("convergence: {} max Iterations: {} num Reduce Tasks: {} Input Vectors: {}", new Object[] { convergenceDelta,
+ maxIterations, numReduceTasks, VectorWritable.class.getName() });
}
boolean converged = false;
int iteration = 1;
@@ -215,11 +206,13 @@ public final class KMeansDriver {
clustersIn = clustersOut;
iteration++;
}
- // now actually cluster the points
- log.info("Clustering ");
- runClustering(input, clustersIn, output + Cluster.CLUSTERED_POINTS_DIR, measureClass, delta);
+ if (runClustering) {
+ // now actually cluster the points
+ log.info("Clustering ");
+ runClustering(input, clustersIn, output + Cluster.CLUSTERED_POINTS_DIR, measureClass, delta);
+ }
}
-
+
/**
* Run the job using supplied arguments
*
@@ -239,19 +232,14 @@ public final class KMeansDriver {
* The iteration number
* @return true if the iteration successfully runs
*/
- private static boolean runIteration(String input,
- String clustersIn,
- String clustersOut,
- String measureClass,
- String convergenceDelta,
- int numReduceTasks,
- int iteration) {
+ private static boolean runIteration(String input, String clustersIn, String clustersOut, String measureClass,
+ String convergenceDelta, int numReduceTasks, int iteration) {
JobConf conf = new JobConf(KMeansDriver.class);
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(KMeansInfo.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Cluster.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(clustersOut);
FileOutputFormat.setOutputPath(conf, outPath);
@@ -264,7 +252,7 @@ public final class KMeansDriver {
conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-
+
try {
JobClient.runJob(conf);
FileSystem fs = FileSystem.get(outPath.toUri(), conf);
@@ -274,7 +262,7 @@ public final class KMeansDriver {
return true;
}
}
-
+
/**
* Run the job using supplied arguments
*
@@ -289,41 +277,36 @@ public final class KMeansDriver {
* @param convergenceDelta
* the convergence delta value
*/
- private static void runClustering(String input,
- String clustersIn,
- String output,
- String measureClass,
- String convergenceDelta) {
+ private static void runClustering(String input, String clustersIn, String output, String measureClass, String convergenceDelta) {
if (log.isInfoEnabled()) {
log.info("Running Clustering");
- log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output,
- measureClass});
+ log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] { input, clustersIn, output, measureClass });
log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName());
}
JobConf conf = new JobConf(KMeansDriver.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
-
+
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(WeightedVectorWritable.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(conf, outPath);
-
+
conf.setMapperClass(KMeansClusterMapper.class);
conf.setNumReduceTasks(0);
conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-
+
try {
JobClient.runJob(conf);
} catch (IOException e) {
log.warn(e.toString(), e);
}
}
-
+
/**
* Return if all of the Clusters in the parts in the filePath have converged or not
*
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Fri Apr 30 22:52:11 2010
@@ -28,6 +28,8 @@ import org.apache.commons.cli2.builder.D
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
@@ -37,6 +39,7 @@ import org.apache.hadoop.mapred.JobClien
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.CommandLineUtil;
@@ -50,10 +53,12 @@ public final class MeanShiftCanopyDriver
public static final String STATE_IN_KEY = "org.apache.mahout.clustering.meanshift.stateInKey";
+ protected static final String CONTROL_CONVERGED = "/control/converged";
+
private MeanShiftCanopyDriver() {
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
@@ -62,6 +67,10 @@ public final class MeanShiftCanopyDriver
Option outputOpt = DefaultOptionCreator.outputOption().create();
Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().create();
Option helpOpt = DefaultOptionCreator.helpOption();
+ Option maxIterOpt = DefaultOptionCreator.maxIterOption().create();
+ Option inputIsCanopiesOpt = obuilder.withLongName("inputIsCanopies").withRequired(true).withShortName("i").withArgument(
+ abuilder.withName("inputIsCanopies").withMinimum(1).withMaximum(1).create()).withDescription(
+ "True if the input directory already contains MeanShiftCanopies").create();
Option modelOpt = obuilder.withLongName("distanceClass").withRequired(true).withShortName("d").withArgument(
abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).withDescription(
@@ -75,8 +84,12 @@ public final class MeanShiftCanopyDriver
abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
.create();
+ Option clusteringOpt = obuilder.withLongName("clustering").withRequired(false).withDescription(
+ "If true, run clustering after the iterations have taken place").withShortName("cl").create();
+
Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).withOption(helpOpt)
- .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(threshold2Opt).create();
+ .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(threshold2Opt).withOption(clusteringOpt).withOption(
+ maxIterOpt).withOption(inputIsCanopiesOpt).create();
try {
Parser parser = new Parser();
@@ -86,6 +99,10 @@ public final class MeanShiftCanopyDriver
CommandLineUtil.printHelp(group);
return;
}
+ boolean runClustering = true;
+ if (cmdLine.hasOption(clusteringOpt)) {
+ runClustering = Boolean.parseBoolean(cmdLine.getValue(clusteringOpt).toString());
+ }
String input = cmdLine.getValue(inputOpt).toString();
String output = cmdLine.getValue(outputOpt).toString();
@@ -93,9 +110,10 @@ public final class MeanShiftCanopyDriver
double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt).toString());
double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt).toString());
double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
+ int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt).toString());
+ boolean inputIsCanopies = Boolean.parseBoolean(cmdLine.getValue(inputIsCanopiesOpt).toString());
createCanopyFromVectors(input, output + "/intial-canopies");
- runJob(output + "/intial-canopies", output, output + MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, measureClassName, t1, t2,
- convergenceDelta);
+ runJob(input, output, measureClassName, t1, t2, convergenceDelta, maxIterations, inputIsCanopies, runClustering);
} catch (OptionException e) {
log.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
@@ -103,7 +121,7 @@ public final class MeanShiftCanopyDriver
}
/**
- * Run the job
+ * Run an iteration
*
* @param input
* the input pathname String
@@ -120,7 +138,7 @@ public final class MeanShiftCanopyDriver
* @param convergenceDelta
* the double convergence criteria
*/
- public static void runJob(String input, String output, String control, String measureClassName, double t1, double t2,
+ static void runIteration(String input, String output, String control, String measureClassName, double t1, double t2,
double convergenceDelta) {
Configurable client = new JobClient();
@@ -160,7 +178,7 @@ public final class MeanShiftCanopyDriver
* @param output
* the output pathname String
*/
- public static void createCanopyFromVectors(String input, String output) {
+ static void createCanopyFromVectors(String input, String output) {
Configurable client = new JobClient();
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
@@ -195,25 +213,23 @@ public final class MeanShiftCanopyDriver
* @param output
* the directory pathname for output clustered points
*/
- public static void runClustering(String input,
- String clustersIn,
- String output) {
-
+ static void runClustering(String input, String clustersIn, String output) {
+
JobConf conf = new JobConf(FuzzyKMeansDriver.class);
conf.setJobName("Mean Shift Clustering");
-
+
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(WeightedVectorWritable.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(conf, outPath);
-
+
conf.setMapperClass(MeanShiftCanopyClusterMapper.class);
-
+
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
-
+
// uncomment it to run locally
// conf.set("mapred.job.tracker", "local");
conf.setNumReduceTasks(0);
@@ -224,4 +240,60 @@ public final class MeanShiftCanopyDriver
log.warn(e.toString(), e);
}
}
+
+ /**
+ * Run the job where the input format can be either Vectors or Canopies
+ *
+ * @param input
+ * the input pathname String
+ * @param output
+ * the output pathname String
+ * @param measureClassName
+ * the DistanceMeasure class name
+ * @param t1
+ * the T1 distance threshold
+ * @param t2
+ * the T2 distance threshold
+ * @param convergenceDelta
+ * the double convergence criteria
+ * @param maxIterations
+ * an int number of iterations
+ * @param inputIsCanopies
+ true if the input path already contains MeanShiftCanopies and does not need to be converted from Vectors
+ * @param runClustering
+ * true if the input points are to be clustered once the iterations complete
+ */
+ public static void runJob(String input, String output, String measureClassName, double t1, double t2, double convergenceDelta,
+ int maxIterations, boolean inputIsCanopies, boolean runClustering) throws IOException {
+ // delete the output directory
+ Configuration conf = new JobConf(MeanShiftCanopyDriver.class);
+
+ String clustersIn = output + Cluster.INITIAL_CLUSTERS_DIR;
+ if (inputIsCanopies) {
+ clustersIn = input;
+ } else {
+ createCanopyFromVectors(input, clustersIn);
+ }
+
+ // iterate until the clusters converge
+ boolean converged = false;
+ int iteration = 1;
+ while (!converged && (iteration <= maxIterations)) {
+ log.info("Iteration {}", iteration);
+ // point the output to a new directory per iteration
+ String clustersOut = output + Cluster.CLUSTERS_DIR + iteration;
+ String controlOut = output + CONTROL_CONVERGED;
+ runIteration(clustersIn, clustersOut, controlOut, measureClassName, t1, t2, convergenceDelta);
+ converged = FileSystem.get(conf).exists(new Path(controlOut));
+ // now point the input to the old output directory
+ clustersIn = clustersOut;
+ iteration++;
+ }
+
+ if (runClustering) {
+ // now cluster the points
+ MeanShiftCanopyDriver.runClustering((inputIsCanopies ? input : output + Cluster.INITIAL_CLUSTERS_DIR), clustersIn, output
+ + Cluster.CLUSTERED_POINTS_DIR);
+ }
+ }
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Fri Apr 30 22:52:11 2010
@@ -364,7 +364,7 @@ public class TestKmeansClustering extend
// now run the Job
HadoopUtil.overwriteOutput("output");
KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10,
- k + 1);
+ k + 1, true);
// now compare the expected clusters with actual
File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
@@ -412,7 +412,7 @@ public class TestKmeansClustering extend
CanopyDriver.runJob("testdata/points", "output", ManhattanDistanceMeasure.class.getName(), 3.1, 2.1, false);
// now run the KMeans job
- KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+ KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true);
// now compare the expected clusters with actual
File outDir = new File("output/clusteredPoints");
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Fri Apr 30 22:52:11 2010
@@ -304,7 +304,7 @@ public class TestMeanShift extends Mahou
ClusteringTestUtils.writePointsToFile(points, "testdata/file1", fs, conf);
ClusteringTestUtils.writePointsToFile(points, "testdata/file2", fs, conf);
// now run the Job
- MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 4, 1, 0.5, 10);
+ MeanShiftCanopyDriver.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 4, 1, 0.5, 10, false, false);
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
Path outPart = new Path("output/clusters-3/part-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, outPart, conf);
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Fri Apr 30 22:52:11 2010
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -123,9 +124,11 @@ public final class Job {
* the canopy T1 threshold
* @param t2
* the canopy T2 threshold
+ * @throws IllegalAccessException
+ * @throws InstantiationException
*/
private static void runJob(String input, String output, String measureClassName,
- double t1, double t2) throws IOException {
+ double t1, double t2) throws IOException, InstantiationException, IllegalAccessException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
@@ -139,6 +142,10 @@ public final class Job {
InputDriver.runJob(input, directoryContainingConvertedInput,
"org.apache.mahout.math.RandomAccessSparseVector");
CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2, true);
+
+ ClusterDumper clusterDumper = new ClusterDumper("output/clusters-0", "output/clusteredPoints");
+ clusterDumper.printClusters(null);
+
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Fri Apr 30 22:52:11 2010
@@ -43,6 +43,7 @@ import org.apache.mahout.clustering.synt
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -161,9 +162,12 @@ public class Job {
fs.mkdirs(outPath);
String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
InputDriver.runJob(input, directoryContainingConvertedInput, vectorClassName);
- DirichletDriver.runJob(directoryContainingConvertedInput, output + "/state", modelFactory,
+ DirichletDriver.runJob(directoryContainingConvertedInput, output, modelFactory,
vectorClassName, 60, numModels, maxIterations, alpha_0, numReducers, true, true, 0);
- printResults(output + "/state", modelFactory, vectorClassName, 60, maxIterations, numModels, alpha_0);
+
+ ClusterDumper clusterDumper = new ClusterDumper(output + "/clusters-5", output + "/clusteredPoints");
+ clusterDumper.printClusters(null);
+
}
/**
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Fri Apr 30 22:52:11 2010
@@ -38,58 +38,59 @@ import org.apache.mahout.clustering.synt
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class Job {
-
+
private static final Logger log = LoggerFactory.getLogger(Job.class);
-
- private Job() { }
-
+
+ private Job() {
+ }
+
public static void main(String[] args) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
Option maxIterationsOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
-
+
Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
- abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
-
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+
Option t1Opt = obuilder.withLongName("t1").withRequired(false).withArgument(
- abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("The t1 value to use.")
- .withShortName("m").create();
+ abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("The t1 value to use.").withShortName("m")
+ .create();
Option t2Opt = obuilder.withLongName("t2").withRequired(false).withArgument(
- abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("The t2 value to use.")
- .withShortName("m").create();
- Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
- abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v")
+ abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("The t2 value to use.").withShortName("m")
.create();
-
+ Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v").create();
+
Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
- measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt)
- .withOption(vectorClassOpt).withOption(t1Opt).withOption(t2Opt).withOption(helpOpt).create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(measureClassOpt).withOption(
+ convergenceDeltaOpt).withOption(maxIterationsOpt).withOption(vectorClassOpt).withOption(t1Opt).withOption(t2Opt)
+ .withOption(helpOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
}
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
- String measureClass = cmdLine.getValue(measureClassOpt,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+ String measureClass = cmdLine.getValue(measureClassOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure")
+ .toString();
double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
@@ -97,14 +98,14 @@ public final class Job {
// String className = cmdLine.getValue(vectorClassOpt,
// "org.apache.mahout.math.RandomAccessSparseVector").toString();
// Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
-
+
runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
} catch (OptionException e) {
log.error("Exception", e);
CommandLineUtil.printHelp(group);
}
}
-
+
/**
* Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
* parameters. All output data will be written to the output directory, which will be initially deleted if
@@ -127,17 +128,14 @@ public final class Job {
* the double convergence criteria for iterations
* @param maxIterations
* the int maximum number of iterations
+ * @throws IllegalAccessException
+ * @throws InstantiationException
*/
- private static void runJob(String input,
- String output,
- String measureClass,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations) throws IOException {
+ private static void runJob(String input, String output, String measureClass, double t1, double t2, double convergenceDelta,
+ int maxIterations) throws IOException, InstantiationException, IllegalAccessException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
-
+
Path outPath = new Path(output);
client.setConf(conf);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -146,14 +144,14 @@ public final class Job {
}
String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput,
- "org.apache.mahout.math.RandomAccessSparseVector");
+ InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
log.info("Running Canopy to get initial clusters");
- CanopyDriver.runJob(directoryContainingConvertedInput,
- output + Cluster.INITIAL_CLUSTERS_DIR, measureClass, t1, t2, false);
+ CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClass, t1, t2, false);
log.info("Running KMeans");
- KMeansDriver.runJob(directoryContainingConvertedInput,
- output + Cluster.INITIAL_CLUSTERS_DIR, output, measureClass, convergenceDelta,
- maxIterations, 1);
+ KMeansDriver.runJob(directoryContainingConvertedInput, output + Cluster.INITIAL_CLUSTERS_DIR, output, measureClass,
+ convergenceDelta, maxIterations, 1, true);
+
+ ClusterDumper clusterDumper = new ClusterDumper(output + "/clusters-10", output + "/clusteredPoints");
+ clusterDumper.printClusters(null);
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Fri Apr 30 22:52:11 2010
@@ -32,48 +32,49 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class Job {
private static final Logger log = LoggerFactory.getLogger(Job.class);
-
+
private static final String CLUSTERED_POINTS_OUTPUT_DIRECTORY = "/clusteredPoints";
-
- private Job() {}
-
+
+ private Job() {
+ }
+
public static void main(String[] args) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
Option maxIterOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
Option helpOpt = DefaultOptionCreator.helpOption();
-
- Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d")
- .withArgument(abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create())
- .withDescription("The distance measure class name.").create();
-
- Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(false).withShortName("t1")
- .withArgument(abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create())
- .withDescription("The T1 distance threshold.").create();
-
- Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(false).withShortName("t2")
- .withArgument(abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create())
- .withDescription("The T1 distance threshold.").create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
- .withOption(modelOpt).withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt)
- .withOption(maxIterOpt).withOption(threshold2Opt).create();
-
+
+ Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d").withArgument(
+ abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The distance measure class name.").create();
+
+ Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(false).withShortName("t1").withArgument(
+ abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
+ .create();
+
+ Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(false).withShortName("t2").withArgument(
+ abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).withDescription("The T2 distance threshold.")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).withOption(helpOpt)
+ .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(maxIterOpt).withOption(threshold2Opt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -82,11 +83,10 @@ public final class Job {
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
- String measureClassName = cmdLine.getValue(modelOpt,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+ String measureClassName = cmdLine.getValue(modelOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt, "47.6").toString());
double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt, "1").toString());
double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
@@ -97,7 +97,7 @@ public final class Job {
CommandLineUtil.printHelp(group);
}
}
-
+
/**
* Run the meanshift clustering job on an input dataset using the given distance measure, t1, t2 and
* iteration parameters. All output data will be written to the output directory, which will be initially
@@ -120,17 +120,14 @@ public final class Job {
* the double convergence criteria for iterations
* @param maxIterations
* the int maximum number of iterations
+ * @throws IllegalAccessException
+ * @throws InstantiationException
*/
- private static void runJob(String input,
- String output,
- String measureClassName,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations) throws IOException {
+ private static void runJob(String input, String output, String measureClassName, double t1, double t2, double convergenceDelta,
+ int maxIterations) throws IOException, InstantiationException, IllegalAccessException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
-
+
Path outPath = new Path(output);
client.setConf(conf);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -139,11 +136,12 @@ public final class Job {
}
String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
InputDriver.runJob(input, directoryContainingConvertedInput);
- MeanShiftCanopyJob.runJob(directoryContainingConvertedInput, output + "/meanshift", measureClassName, t1,
- t2, convergenceDelta, maxIterations, true);
- FileStatus[] status = dfs.listStatus(new Path(output + "/meanshift"));
- OutputDriver.runJob(status[status.length - 1].getPath().toString(), output
- + CLUSTERED_POINTS_OUTPUT_DIRECTORY);
+ MeanShiftCanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2,
+ convergenceDelta, maxIterations, true, true);
+
+ ClusterDumper clusterDumper = new ClusterDumper(output + "/clusters-10", output + "/clusteredPoints");
+ clusterDumper.printClusters(null);
+
}
-
+
}
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Fri Apr 30 22:52:11 2010
@@ -39,7 +39,7 @@ import org.apache.mahout.clustering.diri
import org.apache.mahout.clustering.dirichlet.models.L1ModelDistribution;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.CosineDistanceMeasure;
@@ -163,7 +163,7 @@ public class TestClusterDumper extends M
// now run the Canopy job to prime kMeans canopies
CanopyDriver.runJob("testdata/points", "output", EuclideanDistanceMeasure.class.getName(), 8, 4, false);
// now run the KMeans job
- KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+ KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper("output/clusters-2", "output/clusteredPoints");
clusterDumper.printClusters(termDictionary);
@@ -181,7 +181,7 @@ public class TestClusterDumper extends M
}
public void testMeanShift() throws Exception {
- MeanShiftCanopyJob.runJob("testdata/points", "output", CosineDistanceMeasure.class.getName(), 0.5, 0.01, 0.05, 10);
+ MeanShiftCanopyDriver.runJob("testdata/points", "output", CosineDistanceMeasure.class.getName(), 0.5, 0.01, 0.05, 10, false, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper("output/clusters-1", "output/clusteredPoints");
clusterDumper.printClusters(termDictionary);
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=939867&r1=939866&r2=939867&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Fri Apr 30 22:52:11 2010
@@ -39,7 +39,7 @@ import org.apache.mahout.clustering.diri
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
@@ -163,7 +163,7 @@ public class TestCDbwEvaluator extends M
// now run the Canopy job to prime kMeans canopies
CanopyDriver.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1, false);
// now run the KMeans job
- KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+ KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true);
int numIterations = 2;
CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
numIterations, 1);
@@ -183,7 +183,7 @@ public class TestCDbwEvaluator extends M
}
public void testMeanShift() throws Exception {
- MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10);
+ MeanShiftCanopyDriver.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10, false, true);
int numIterations = 2;
CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
numIterations, 1);