You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by pr...@apache.org on 2012/03/17 08:34:32 UTC
svn commit: r1301886 - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/kmeans/
core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/
core/src/test/java/org/apache/mahout/clustering/kmeans/
core/src/test/java/org/apache/maho...
Author: pranjan
Date: Sat Mar 17 07:34:31 2012
New Revision: 1301886
URL: http://svn.apache.org/viewvc?rev=1301886&view=rev
Log:
MAHOUT-981, Added outlier removal option in method and CLI for KMeansDriver.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Sat Mar 17 07:34:31 2012
@@ -82,6 +82,7 @@ public class KMeansDriver extends Abstra
addOption(DefaultOptionCreator.overwriteOption().create());
addOption(DefaultOptionCreator.clusteringOption().create());
addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.outlierThresholdOption().create());
if (parseArguments(args) == null) {
return -1;
@@ -111,28 +112,39 @@ public class KMeansDriver extends Abstra
if (getConf() == null) {
setConf(new Configuration());
}
- run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, runSequential);
+ double clusterClassificationThreshold = 0.0;
+ if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+ clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+ }
+ run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
+ clusterClassificationThreshold, runSequential);
return 0;
}
- /**
- * Iterate over the input vectors to produce clusters and, if requested, use the
- * results of the final iteration to cluster the input vectors.
+ /**
+ * Iterate over the input vectors to produce clusters and, if requested, use
+ * the results of the final iteration to cluster the input vectors.
+ *
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
- * @param measure
+ * @param measure
* the DistanceMeasure to use
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
- * @param runClustering
+ * @param runClustering
* true if points are to be clustered after iterations are completed
- * @param runSequential if true execute sequential algorithm
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value
+ * should be between 0 and 1. Vectors having pdf below this value
+ * will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
*/
public static void run(Configuration conf,
Path input,
@@ -142,6 +154,7 @@ public class KMeansDriver extends Abstra
double convergenceDelta,
int maxIterations,
boolean runClustering,
+ double clusterClassificationThreshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
@@ -161,14 +174,14 @@ public class KMeansDriver extends Abstra
clustersOut,
output,
measure,
- delta,
+ clusterClassificationThreshold,
runSequential);
}
}
/**
- * Iterate over the input vectors to produce clusters and, if requested, use the
- * results of the final iteration to cluster the input vectors.
+ * Iterate over the input vectors to produce clusters and, if requested, use
+ * the results of the final iteration to cluster the input vectors.
*
* @param input
* the directory pathname for input points
@@ -176,15 +189,20 @@ public class KMeansDriver extends Abstra
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
- * @param measure
+ * @param measure
* the DistanceMeasure to use
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
- * @param runClustering
+ * @param runClustering
* true if points are to be clustered after iterations are completed
- * @param runSequential if true execute sequential algorithm
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value
+ * should be between 0 and 1. Vectors having pdf below this value
+ * will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
*/
public static void run(Path input,
Path clustersIn,
@@ -193,6 +211,7 @@ public class KMeansDriver extends Abstra
double convergenceDelta,
int maxIterations,
boolean runClustering,
+ double clusterClassificationThreshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
run(new Configuration(),
@@ -203,6 +222,7 @@ public class KMeansDriver extends Abstra
convergenceDelta,
maxIterations,
runClustering,
+ clusterClassificationThreshold,
runSequential);
}
@@ -404,6 +424,7 @@ public class KMeansDriver extends Abstra
/**
* Run the job using supplied arguments
+ *
* @param input
* the directory pathname for input points
* @param clustersIn
@@ -412,25 +433,26 @@ public class KMeansDriver extends Abstra
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
- * @param convergenceDelta
- * the convergence delta value
- * @param runSequential if true execute sequential algorithm
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value
+ * should be between 0 and 1. Vectors having pdf below this value
+ * will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
*/
public static void clusterData(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
- String convergenceDelta,
+ double clusterClassificationThreshold,
boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
if (log.isInfoEnabled()) {
log.info("Running Clustering");
log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure});
- log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName());
}
- Double clusterClassificationThreshold = 0.0;
ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY),
clusterClassificationThreshold, true, runSequential);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java Sat Mar 17 07:34:31 2012
@@ -187,6 +187,7 @@ public class SpectralKMeansDriver extend
convergenceDelta,
maxIterations,
true,
+ 0.0,
false);
}
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Sat Mar 17 07:34:31 2012
@@ -527,7 +527,7 @@ public final class TestKmeansClustering
// now run the KMeans job
Path kmeansOutput = new Path(outputPath, "kmeans");
KMeansDriver.run(pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, new EuclideanDistanceMeasure(),
- 0.001, 10, true, false);
+ 0.001, 10, true, 0.0, false);
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java Sat Mar 17 07:34:31 2012
@@ -101,7 +101,7 @@ public final class ClusterCountReaderTes
CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, 0.0, true);
Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0'
+ Cluster.FINAL_ITERATION_SUFFIX));
- KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, true);
+ KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, 0.0, true);
}
private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) {
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java Sat Mar 17 07:34:31 2012
@@ -93,7 +93,7 @@ public class DisplayKMeans extends Displ
DistanceMeasure measure, int maxIterations, double convergenceDelta) throws IOException, InterruptedException,
ClassNotFoundException {
Path clusters = RandomSeedGenerator.buildRandom(conf, samples, new Path(output, "clusters-0"), 3, measure);
- KMeansDriver.run(samples, clusters, output, measure, convergenceDelta, maxIterations, true, true);
+ KMeansDriver.run(samples, clusters, output, measure, convergenceDelta, maxIterations, true, 0.0, true);
loadClusters(output);
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sat Mar 17 07:34:31 2012
@@ -145,7 +145,7 @@ public final class Job extends AbstractJ
directoryContainingConvertedInput, clusters, k, measure);
log.info("Running KMeans");
KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output,
- measure, convergenceDelta, maxIterations, true, false);
+ measure, convergenceDelta, maxIterations, true, 0.0, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, maxIterations), new Path(output, "clusteredPoints"));
@@ -195,7 +195,7 @@ public final class Job extends AbstractJ
log.info("Running KMeans");
KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output,
Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta,
- maxIterations, true, false);
+ maxIterations, true, 0.0, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, maxIterations), new Path(output, "clusteredPoints"));
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Sat Mar 17 07:34:31 2012
@@ -207,7 +207,7 @@ public final class TestClusterDumper ext
4, false, 0.0, true);
// now run the KMeans job
KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
- "clusters-0-final"), output, measure, 0.001, 10, true, false);
+ "clusters-0-final"), output, measure, 0.001, 10, true, 0.0, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
@@ -338,7 +338,7 @@ public final class TestClusterDumper ext
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(svdData, new Path(output, "clusters-0"), kmeansOutput, measure,
- 0.001, 10, true, true);
+ 0.001, 10, true, 0.0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
@@ -380,7 +380,7 @@ public final class TestClusterDumper ext
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
- kmeansOutput, measure, 0.001, 10, true, true);
+ kmeansOutput, measure, 0.001, 10, true, 0.0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
@@ -425,7 +425,7 @@ public final class TestClusterDumper ext
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
- kmeansOutput, measure, 0.001, 10, true, true);
+ kmeansOutput, measure, 0.001, 10, true, 0.0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java Sat Mar 17 07:34:31 2012
@@ -344,7 +344,7 @@ public final class TestClusterEvaluator
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
- 0.001, 10, true, true);
+ 0.001, 10, true, 0.0, true);
int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Sat Mar 17 07:34:31 2012
@@ -354,7 +354,7 @@ public final class TestCDbwEvaluator ext
// now run the KMeans job
Path kmeansOutput = new Path(output, "kmeans");
KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
- 0.001, 10, true, true);
+ 0.001, 10, true, 0.0, true);
int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,