You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/02 23:52:20 UTC
svn commit: r930402 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
Author: jeastman
Date: Fri Apr 2 21:52:20 2010
New Revision: 930402
URL: http://svn.apache.org/viewvc?rev=930402&view=rev
Log:
MAHOUT-339: Added option to pass clusters as input to mean shift clustering in addition to vectors. Changed synthetic control example job to use this option. All tests run.
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java?rev=930402&r1=930401&r2=930402&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java Fri Apr 2 21:52:20 2010
@@ -37,40 +37,40 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class MeanShiftCanopyJob {
-
+
protected static final String CONTROL_CONVERGED = "/control/converged";
-
+
private static final Logger log = LoggerFactory.getLogger(MeanShiftCanopyJob.class);
-
- private MeanShiftCanopyJob() { }
-
+
+ private MeanShiftCanopyJob() {
+ }
+
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().create();
Option outputOpt = DefaultOptionCreator.outputOption().create();
Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().create();
Option maxIterOpt = DefaultOptionCreator.maxIterOption().create();
Option helpOpt = DefaultOptionCreator.helpOption();
-
- Option modelOpt = obuilder.withLongName("distanceClass").withRequired(true).withShortName("d")
- .withArgument(abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create())
- .withDescription("The distance measure class name.").create();
-
- Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(true).withShortName("t1")
- .withArgument(abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create())
- .withDescription("The T1 distance threshold.").create();
-
- Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(true).withShortName("t2")
- .withArgument(abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create())
- .withDescription("The T1 distance threshold.").create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
- .withOption(modelOpt).withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt)
- .withOption(maxIterOpt).withOption(threshold2Opt).create();
-
+
+ Option modelOpt = obuilder.withLongName("distanceClass").withRequired(true).withShortName("d").withArgument(
+ abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The distance measure class name.").create();
+
+ Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(true).withShortName("t1").withArgument(
+ abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
+ .create();
+
+ Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(true).withShortName("t2").withArgument(
+ abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).withDescription("The T1 distance threshold.")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).withOption(helpOpt)
+ .withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(maxIterOpt).withOption(threshold2Opt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -79,7 +79,7 @@ public class MeanShiftCanopyJob {
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt).toString();
String output = cmdLine.getValue(outputOpt).toString();
String measureClassName = cmdLine.getValue(modelOpt).toString();
@@ -93,9 +93,9 @@ public class MeanShiftCanopyJob {
CommandLineUtil.printHelp(group);
}
}
-
+
/**
- * Run the job
+ * Run the job, first converting the input Vectors to Canopies
*
* @param input
* the input pathname String
@@ -112,13 +112,33 @@ public class MeanShiftCanopyJob {
* @param maxIterations
* an int number of iterations
*/
- public static void runJob(String input,
- String output,
- String measureClassName,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations) throws IOException {
+ public static void runJob(String input, String output, String measureClassName, double t1, double t2, double convergenceDelta,
+ int maxIterations) throws IOException {
+ runJob(input, output, measureClassName, t1,t2,convergenceDelta, maxIterations, false);
+ }
+
+ /**
+ * Run the job where the input format can be either Vectors or Canopies
+ *
+ * @param input
+ * the input pathname String
+ * @param output
+ * the output pathname String
+ * @param measureClassName
+ * the DistanceMeasure class name
+ * @param t1
+ * the T1 distance threshold
+ * @param t2
+ * the T2 distance threshold
+ * @param convergenceDelta
+ * the double convergence criteria
+ * @param maxIterations
+ * an int number of iterations
+ * @param inputIsCanopies
+ * true if the input path already contains MeanShiftCanopies and does not need to be converted from Vectors
+ */
+ public static void runJob(String input, String output, String measureClassName, double t1, double t2, double convergenceDelta,
+ int maxIterations, boolean inputIsCanopies) throws IOException {
// delete the output directory
Configuration conf = new JobConf(MeanShiftCanopyDriver.class);
Path outPath = new Path(output);
@@ -127,25 +147,27 @@ public class MeanShiftCanopyJob {
fs.delete(outPath, true);
}
fs.mkdirs(outPath);
-
- MeanShiftCanopyDriver.createCanopyFromVectors(input, output+"/initial-canopies");
-
+
+ String clustersIn = output + "/initial-canopies";
+ if (inputIsCanopies)
+ clustersIn = input;
+ else
+ MeanShiftCanopyDriver.createCanopyFromVectors(input, clustersIn);
+
// iterate until the clusters converge
boolean converged = false;
int iteration = 0;
- String clustersIn = output+"/initial-canopies";
while (!converged && (iteration < maxIterations)) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
String clustersOut = output + "/canopies-" + iteration;
String controlOut = output + CONTROL_CONVERGED;
- MeanShiftCanopyDriver.runJob(clustersIn, clustersOut, controlOut, measureClassName, t1, t2,
- convergenceDelta);
+ MeanShiftCanopyDriver.runJob(clustersIn, clustersOut, controlOut, measureClassName, t1, t2, convergenceDelta);
converged = FileSystem.get(conf).exists(new Path(controlOut));
// now point the input to the old output directory
clustersIn = output + "/canopies-" + iteration;
iteration++;
}
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=930402&r1=930401&r2=930402&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Fri Apr 2 21:52:20 2010
@@ -140,7 +140,7 @@ public final class Job {
String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
InputDriver.runJob(input, directoryContainingConvertedInput);
MeanShiftCanopyJob.runJob(directoryContainingConvertedInput, output + "/meanshift", measureClassName, t1,
- t2, convergenceDelta, maxIterations);
+ t2, convergenceDelta, maxIterations, true);
FileStatus[] status = dfs.listStatus(new Path(output + "/meanshift"));
OutputDriver.runJob(status[status.length - 1].getPath().toString(), output
+ CLUSTERED_POINTS_OUTPUT_DIRECTORY);