You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by pr...@apache.org on 2012/03/17 08:34:32 UTC

svn commit: r1301886 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/kmeans/ core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/ core/src/test/java/org/apache/mahout/clustering/kmeans/ core/src/test/java/org/apache/maho...

Author: pranjan
Date: Sat Mar 17 07:34:31 2012
New Revision: 1301886

URL: http://svn.apache.org/viewvc?rev=1301886&view=rev
Log:
MAHOUT-981, Added outlier removal option in method and CLI for KMeansDriver.

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
    mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
    mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Sat Mar 17 07:34:31 2012
@@ -82,6 +82,7 @@ public class KMeansDriver extends Abstra
     addOption(DefaultOptionCreator.overwriteOption().create());
     addOption(DefaultOptionCreator.clusteringOption().create());
     addOption(DefaultOptionCreator.methodOption().create());
+    addOption(DefaultOptionCreator.outlierThresholdOption().create());
 
     if (parseArguments(args) == null) {
       return -1;
@@ -111,28 +112,39 @@ public class KMeansDriver extends Abstra
     if (getConf() == null) {
       setConf(new Configuration());
     }
-    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, runSequential);
+    double clusterClassificationThreshold = 0.0;
+    if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+      clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+    }
+    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
+        clusterClassificationThreshold, runSequential);
     return 0;
   }
 
-  /**
-   * Iterate over the input vectors to produce clusters and, if requested, use the
-   * results of the final iteration to cluster the input vectors.
+  /**
+   * Iterate over the input vectors to produce clusters and, if requested, use
+   * the results of the final iteration to cluster the input vectors.
+   * 
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
    *          the directory pathname for output points
-   * @param measure 
+   * @param measure
    *          the DistanceMeasure to use
    * @param convergenceDelta
    *          the convergence delta value
    * @param maxIterations
    *          the maximum number of iterations
-   * @param runClustering 
+   * @param runClustering
    *          true if points are to be clustered after iterations are completed
-   * @param runSequential if true execute sequential algorithm
+   * @param clusterClassificationThreshold
+   *          Is a clustering strictness / outlier removal parameter. Its value
+   *          should be between 0 and 1. Vectors having pdf below this value
+   *          will not be clustered.
+   * @param runSequential
+   *          if true execute sequential algorithm
    */
   public static void run(Configuration conf,
                          Path input,
@@ -142,6 +154,7 @@ public class KMeansDriver extends Abstra
                          double convergenceDelta,
                          int maxIterations,
                          boolean runClustering,
+                         double clusterClassificationThreshold, 
                          boolean runSequential)
     throws IOException, InterruptedException, ClassNotFoundException {
 
@@ -161,14 +174,14 @@ public class KMeansDriver extends Abstra
           clustersOut,
           output,
           measure,
-          delta,
+          clusterClassificationThreshold,
           runSequential);
     }
   }
 
   /**
-   * Iterate over the input vectors to produce clusters and, if requested, use the
-   * results of the final iteration to cluster the input vectors.
+   * Iterate over the input vectors to produce clusters and, if requested, use
+   * the results of the final iteration to cluster the input vectors.
    * 
    * @param input
    *          the directory pathname for input points
@@ -176,15 +189,20 @@ public class KMeansDriver extends Abstra
    *          the directory pathname for initial & computed clusters
    * @param output
    *          the directory pathname for output points
-   * @param measure 
+   * @param measure
    *          the DistanceMeasure to use
    * @param convergenceDelta
    *          the convergence delta value
    * @param maxIterations
    *          the maximum number of iterations
-   * @param runClustering 
+   * @param runClustering
    *          true if points are to be clustered after iterations are completed
-   * @param runSequential if true execute sequential algorithm
+   * @param clusterClassificationThreshold
+   *          Is a clustering strictness / outlier removal parameter. Its value
+   *          should be between 0 and 1. Vectors having pdf below this value
+   *          will not be clustered.
+   * @param runSequential
+   *          if true execute sequential algorithm
    */
   public static void run(Path input,
                          Path clustersIn,
@@ -193,6 +211,7 @@ public class KMeansDriver extends Abstra
                          double convergenceDelta,
                          int maxIterations,
                          boolean runClustering,
+                         double clusterClassificationThreshold, 
                          boolean runSequential)
     throws IOException, InterruptedException, ClassNotFoundException {
     run(new Configuration(),
@@ -203,6 +222,7 @@ public class KMeansDriver extends Abstra
         convergenceDelta,
         maxIterations,
         runClustering,
+        clusterClassificationThreshold, 
         runSequential);
   }
 
@@ -404,6 +424,7 @@ public class KMeansDriver extends Abstra
 
   /**
    * Run the job using supplied arguments
+   * 
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
@@ -412,25 +433,26 @@ public class KMeansDriver extends Abstra
    *          the directory pathname for output points
    * @param measure
    *          the classname of the DistanceMeasure
-   * @param convergenceDelta
-   *          the convergence delta value
-   * @param runSequential if true execute sequential algorithm
+   * @param clusterClassificationThreshold
+   *          Is a clustering strictness / outlier removal parameter. Its value
+   *          should be between 0 and 1. Vectors having pdf below this value
+   *          will not be clustered.
+   * @param runSequential
+   *          if true execute sequential algorithm
    */
   public static void clusterData(Configuration conf,
                                  Path input,
                                  Path clustersIn,
                                  Path output,
                                  DistanceMeasure measure,
-                                 String convergenceDelta,
+                                 double clusterClassificationThreshold,
                                  boolean runSequential)
     throws IOException, InterruptedException, ClassNotFoundException {
 
     if (log.isInfoEnabled()) {
       log.info("Running Clustering");
       log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure});
-      log.info("convergence: {} Input Vectors: {}", convergenceDelta, VectorWritable.class.getName());
     }
-    Double clusterClassificationThreshold = 0.0;
     ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
     ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY),
         clusterClassificationThreshold, true, runSequential);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java Sat Mar 17 07:34:31 2012
@@ -187,6 +187,7 @@ public class SpectralKMeansDriver extend
                      convergenceDelta,
                      maxIterations,
                      true,
+                     0.0, 
                      false);
   }
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Sat Mar 17 07:34:31 2012
@@ -527,7 +527,7 @@ public final class TestKmeansClustering 
     // now run the KMeans job
     Path kmeansOutput = new Path(outputPath, "kmeans");
 	KMeansDriver.run(pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, new EuclideanDistanceMeasure(),
-        0.001, 10, true, false);
+        0.001, 10, true, 0.0, false);
     
     // now compare the expected clusters with actual
     Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java Sat Mar 17 07:34:31 2012
@@ -101,7 +101,7 @@ public final class ClusterCountReaderTes
     CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, 0.0, true);
     Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0'
                                                                    + Cluster.FINAL_ITERATION_SUFFIX));
-    KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, true);
+    KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, 0.0, true);
   }
   
   private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) {

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java Sat Mar 17 07:34:31 2012
@@ -93,7 +93,7 @@ public class DisplayKMeans extends Displ
       DistanceMeasure measure, int maxIterations, double convergenceDelta) throws IOException, InterruptedException,
       ClassNotFoundException {
     Path clusters = RandomSeedGenerator.buildRandom(conf, samples, new Path(output, "clusters-0"), 3, measure);
-    KMeansDriver.run(samples, clusters, output, measure, convergenceDelta, maxIterations, true, true);
+    KMeansDriver.run(samples, clusters, output, measure, convergenceDelta, maxIterations, true, 0.0, true);
     loadClusters(output);
   }
   

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sat Mar 17 07:34:31 2012
@@ -145,7 +145,7 @@ public final class Job extends AbstractJ
         directoryContainingConvertedInput, clusters, k, measure);
     log.info("Running KMeans");
     KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output,
-        measure, convergenceDelta, maxIterations, true, false);
+        measure, convergenceDelta, maxIterations, true, 0.0, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
         output, maxIterations), new Path(output, "clusteredPoints"));
@@ -195,7 +195,7 @@ public final class Job extends AbstractJ
     log.info("Running KMeans");
     KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output,
         Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta,
-        maxIterations, true, false);
+        maxIterations, true, 0.0, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
         output, maxIterations), new Path(output, "clusteredPoints"));

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Sat Mar 17 07:34:31 2012
@@ -207,7 +207,7 @@ public final class TestClusterDumper ext
         4, false, 0.0, true);
     // now run the KMeans job
     KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
-        "clusters-0-final"), output, measure, 0.001, 10, true, false);
+        "clusters-0-final"), output, measure, 0.001, 10, true, 0.0, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
         output, 10), new Path(output, "clusteredPoints"));
@@ -338,7 +338,7 @@ public final class TestClusterDumper ext
     // now run the KMeans job
     Path kmeansOutput = new Path(output, "kmeans");
 	KMeansDriver.run(svdData, new Path(output, "clusters-0"), kmeansOutput, measure,
-        0.001, 10, true, true);
+        0.001, 10, true, 0.0, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
     		kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
@@ -380,7 +380,7 @@ public final class TestClusterDumper ext
     // now run the KMeans job
     Path kmeansOutput = new Path(output, "kmeans");
 	KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
-        kmeansOutput, measure, 0.001, 10, true, true);
+        kmeansOutput, measure, 0.001, 10, true, 0.0, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
     		kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));
@@ -425,7 +425,7 @@ public final class TestClusterDumper ext
     // now run the KMeans job
     Path kmeansOutput = new Path(output, "kmeans");
 	KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
-        kmeansOutput, measure, 0.001, 10, true, true);
+        kmeansOutput, measure, 0.001, 10, true, 0.0, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
     		kmeansOutput, 10), new Path(kmeansOutput, "clusteredPoints"));

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java Sat Mar 17 07:34:31 2012
@@ -344,7 +344,7 @@ public final class TestClusterEvaluator 
     // now run the KMeans job
     Path kmeansOutput = new Path(output, "kmeans");
 	KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
-        0.001, 10, true, true);
+        0.001, 10, true, 0.0, true);
     int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-2");
     RepresentativePointsDriver.run(conf, clustersIn, new Path(output,

Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1301886&r1=1301885&r2=1301886&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Sat Mar 17 07:34:31 2012
@@ -354,7 +354,7 @@ public final class TestCDbwEvaluator ext
     // now run the KMeans job
     Path kmeansOutput = new Path(output, "kmeans");
 	KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure,
-        0.001, 10, true, true);
+        0.001, 10, true, 0.0, true);
     int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-2");
     RepresentativePointsDriver.run(conf, clustersIn, new Path(output,