You are viewing a plain text version of this content. The canonical link was not preserved in this plain text version.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/07/24 05:39:33 UTC

svn commit: r978786 [2/2] - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/dirichlet/models/ core/src/main/java/org/apache/mahou...

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java Sat Jul 24 03:39:30 2010
@@ -112,11 +112,8 @@ public class MeanShiftCanopyClusterer {
    * @return if the cluster is converged
    */
   public boolean shiftToMean(MeanShiftCanopy canopy) {
-    Vector centroid = canopy.computeCentroid();
-    canopy.setConverged(measure.distance(centroid, canopy.getCenter()) < convergenceDelta);
-    canopy.setCenter(centroid);
-    canopy.setNumPoints(1);
-    canopy.setPointTotal(centroid.clone());
+    canopy.computeConvergence(measure, convergenceDelta);
+    canopy.computeParameters();
     return canopy.isConverged();
   }
 
@@ -194,8 +191,7 @@ public class MeanShiftCanopyClusterer {
     return migratedCanopies;
   }
 
-  @SuppressWarnings("unused")
-  private static void verifyNonOverlap(List<MeanShiftCanopy> canopies) {
+  protected static void verifyNonOverlap(List<MeanShiftCanopy> canopies) {
     Set<Integer> coveredPoints = new HashSet<Integer>();
     // verify no overlap
     for (MeanShiftCanopy canopy : canopies) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Sat Jul 24 03:39:30 2010
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.lib.i
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -439,6 +440,9 @@ public class MeanShiftCanopyDriver exten
                                                            MeanShiftCanopy.class);
       try {
         for (MeanShiftCanopy cluster : clusters) {
+          log.info("Writing Cluster:" + cluster.getId() + " center:" + AbstractCluster.formatVector(cluster.getCenter(), null)
+                   + " numPoints:" + cluster.getNumPoints() + " radius:" + AbstractCluster.formatVector(cluster.getRadius(), null) + " to: "
+                   + clustersOut.getName());
           writer.append(new Text(cluster.getIdentifier()), cluster);
         }
       } finally {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java Sat Jul 24 03:39:30 2010
@@ -176,7 +176,7 @@ public class TestClusterInterface extend
     Cluster cluster = new Canopy(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
+    assertEquals("format", "C-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
   }
 
   public void testCanopyAsFormatStringSparse() {
@@ -186,7 +186,7 @@ public class TestClusterInterface extend
     Cluster cluster = new Canopy(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+    assertEquals("format", "C-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
   }
 
   public void testCanopyAsFormatStringWithBindings() {
@@ -196,7 +196,7 @@ public class TestClusterInterface extend
     String[] bindings = { "fee", null, null };
     String formatString = cluster.asFormatString(bindings);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [fee:1.100, 1:2.200, 2:3.300]", formatString);
+    assertEquals("format", "C-123{n=0 c=[fee:1.100, 1:2.200, 2:3.300] r=[]}", formatString);
   }
 
   public void testCanopyAsFormatStringSparseWithBindings() {
@@ -206,7 +206,7 @@ public class TestClusterInterface extend
     Cluster cluster = new Canopy(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+    assertEquals("format", "C-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
   }
 
   public void testClusterAsFormatString() {
@@ -215,7 +215,7 @@ public class TestClusterInterface extend
     Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
+    assertEquals("format", "CL-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
   }
 
   public void testClusterAsFormatStringSparse() {
@@ -225,7 +225,7 @@ public class TestClusterInterface extend
     Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+    assertEquals("format", "CL-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
   }
 
   public void testClusterAsFormatStringWithBindings() {
@@ -235,7 +235,7 @@ public class TestClusterInterface extend
     String[] bindings = { "fee", null, "foo" };
     String formatString = cluster.asFormatString(bindings);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
+    assertEquals("format", "CL-123{n=0 c=[fee:1.100, 1:2.200, foo:3.300] r=[]}", formatString);
   }
 
   public void testClusterAsFormatStringSparseWithBindings() {
@@ -245,7 +245,7 @@ public class TestClusterInterface extend
     Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+    assertEquals("format", "CL-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
   }
 
   public void testMSCanopyAsFormatString() {
@@ -254,7 +254,7 @@ public class TestClusterInterface extend
     Cluster cluster = new MeanShiftCanopy(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
+    assertEquals("format", "MSC-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
   }
 
   public void testMSCanopyAsFormatStringSparse() {
@@ -264,7 +264,7 @@ public class TestClusterInterface extend
     Cluster cluster = new MeanShiftCanopy(m, 123);
     String formatString = cluster.asFormatString(null);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+    assertEquals("format", "MSC-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
   }
 
   public void testMSCanopyAsFormatStringWithBindings() {
@@ -274,7 +274,7 @@ public class TestClusterInterface extend
     String[] bindings = { "fee", null, "foo" };
     String formatString = cluster.asFormatString(bindings);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
+    assertEquals("format", "MSC-123{n=0 c=[fee:1.100, 1:2.200, foo:3.300] r=[]}", formatString);
   }
 
   public void testMSCanopyAsFormatStringSparseWithBindings() {
@@ -285,7 +285,7 @@ public class TestClusterInterface extend
     String[] bindings = { "fee", null, "foo" };
     String formatString = cluster.asFormatString(bindings);
     System.out.println(formatString);
-    assertEquals("format", "C-123: [fee:1.100, foo:3.300]", formatString);
+    assertEquals("format", "MSC-123{n=0 c=[fee:1.100, foo:3.300] r=[]}", formatString);
   }
 
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Sat Jul 24 03:39:30 2010
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.DummyRecordWriter;
@@ -92,7 +92,7 @@ public class TestCanopyCreation extends 
    */
   private static void printCanopies(List<Canopy> canopies) {
     for (Canopy canopy : canopies) {
-      System.out.println(canopy.toString());
+      System.out.println(canopy.asFormatString(null));
     }
   }
 
@@ -110,9 +110,9 @@ public class TestCanopyCreation extends 
     super.setUp();
     fs = FileSystem.get(new Configuration());
     referenceManhattan = CanopyClusterer.createCanopies(getPoints(), manhattanDistanceMeasure, 3.1, 2.1);
-    manhattanCentroids = CanopyClusterer.calculateCentroids(referenceManhattan);
+    manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan);
     referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), euclideanDistanceMeasure, 3.1, 2.1);
-    euclideanCentroids = CanopyClusterer.calculateCentroids(referenceEuclidean);
+    euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean);
   }
 
   /** Story: User can cluster points using a ManhattanDistanceMeasure and a reference implementation */
@@ -437,7 +437,7 @@ public class TestCanopyCreation extends 
     WeightedVectorWritable vector = new WeightedVectorWritable();
     while (reader.next(clusterId, vector)) {
       count++;
-      System.out.println("Txt: " + clusterId + " Vec: " + ClusterBase.formatVector(vector.getVector().get(), null));
+      System.out.println("Txt: " + clusterId + " Vec: " + AbstractCluster.formatVector(vector.getVector().get(), null));
     }
     assertEquals("number of points", points.size(), count);
     reader.close();
@@ -477,7 +477,7 @@ public class TestCanopyCreation extends 
     WeightedVectorWritable vector = new WeightedVectorWritable();
     while (reader.next(clusterId, vector)) {
       count++;
-      System.out.println("Txt: " + clusterId + " Vec: " + ClusterBase.formatVector(vector.getVector().get(), null));
+      System.out.println("Txt: " + clusterId + " Vec: " + AbstractCluster.formatVector(vector.getVector().get(), null));
     }
     assertEquals("number of points", points.size(), count);
     reader.close();
@@ -502,7 +502,7 @@ public class TestCanopyCreation extends 
     WeightedVectorWritable vector = new WeightedVectorWritable();
     while (reader.next(clusterId, vector)) {
       count++;
-      System.out.println("Txt: " + clusterId + " Vec: " + ClusterBase.formatVector(vector.getVector().get(), null));
+      System.out.println("Txt: " + clusterId + " Vec: " + AbstractCluster.formatVector(vector.getVector().get(), null));
     }
     assertEquals("number of points", points.size(), count);
     reader.close();
@@ -532,7 +532,7 @@ public class TestCanopyCreation extends 
     WeightedVectorWritable vw = new WeightedVectorWritable();
     while (reader.next(canopyId, vw)) {
       count++;
-      System.out.println("Txt: " + canopyId.toString() + " Vec: " + ClusterBase.formatVector(vw.getVector().get(), null));
+      System.out.println("Txt: " + canopyId.toString() + " Vec: " + AbstractCluster.formatVector(vw.getVector().get(), null));
     }
     assertEquals("number of points", points.size(), count);
     reader.close();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Sat Jul 24 03:39:30 2010
@@ -35,6 +35,8 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.ClusterObservations;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
@@ -185,12 +187,12 @@ public class TestFuzzyKmeansClustering e
 
   public void testFuzzyKMeansSeqJob() throws Exception {
     List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-  
+
     Path pointsPath = getTestTempDirPath("points");
     Path clustersPath = getTestTempDirPath("clusters");
     Configuration conf = new Configuration();
     ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
-  
+
     for (int k = 0; k < points.size(); k++) {
       System.out.println("testKFuzzyKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
@@ -201,18 +203,18 @@ public class TestFuzzyKmeansClustering e
                                                            SoftCluster.class);
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
-  
-        SoftCluster cluster = new SoftCluster(vec);
+
+        SoftCluster cluster = new SoftCluster(vec, i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter(), 1);
+        cluster.observe(cluster.getCenter(), 1);
         /*
          * writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
          */
         writer.append(new Text(cluster.getIdentifier()), cluster);
-  
+
       }
       writer.close();
-  
+
       // now run the Job using the run() command line options.
       Path output = getTestTempDirPath("output");
       /*      FuzzyKMeansDriver.runJob(pointsPath,
@@ -227,17 +229,12 @@ public class TestFuzzyKmeansClustering e
                                      true,
                                      0);
       */
-      String[] args = { 
-          optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(), 
-          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), 
-          optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
-          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
-          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", 
-          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", 
-          optKey(FuzzyKMeansDriver.M_OPTION), "2.0", 
-          optKey(DefaultOptionCreator.CLUSTERING_OPTION),
-          optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), 
-          optKey(DefaultOptionCreator.OVERWRITE_OPTION),
+      String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
+          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION),
+          output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
+          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
+          optKey(FuzzyKMeansDriver.M_OPTION), "2.0", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+          optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION),
           optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD };
       new FuzzyKMeansDriver().run(args);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-0"), conf);
@@ -248,7 +245,7 @@ public class TestFuzzyKmeansClustering e
       }
       reader.close();
     }
-  
+
   }
 
   public void testFuzzyKMeansMRJob() throws Exception {
@@ -270,9 +267,9 @@ public class TestFuzzyKmeansClustering e
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
 
-        SoftCluster cluster = new SoftCluster(vec);
+        SoftCluster cluster = new SoftCluster(vec, i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter(), 1);
+        cluster.observe(cluster.getCenter(), 1);
         /*
          * writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
          */
@@ -295,17 +292,12 @@ public class TestFuzzyKmeansClustering e
                                      true,
                                      0);
       */
-      String[] args = { 
-          optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(), 
-          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), 
-          optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
-          optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
-          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", 
-          optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2", 
-          optKey(FuzzyKMeansDriver.M_OPTION), "2.0", 
-          optKey(DefaultOptionCreator.CLUSTERING_OPTION),
-          optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), 
-          optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
+      String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
+          optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION),
+          output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
+          optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
+          optKey(FuzzyKMeansDriver.M_OPTION), "2.0", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+          optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
       new FuzzyKMeansDriver().run(args);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-00000"), conf);
       IntWritable key = new IntWritable();
@@ -330,7 +322,7 @@ public class TestFuzzyKmeansClustering e
         Vector vec = tweakValue(points.get(i).get());
 
         SoftCluster cluster = new SoftCluster(vec, i);
-        cluster.addPoint(cluster.getCenter(), 1);
+        cluster.observe(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
 
@@ -345,10 +337,9 @@ public class TestFuzzyKmeansClustering e
       conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
       conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
 
-      DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                        conf,
-                                                                                                                        mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       mapper.setup(mapContext);
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapContext);
@@ -361,15 +352,15 @@ public class TestFuzzyKmeansClustering e
 
       for (Text key : mapWriter.getKeys()) {
         // SoftCluster cluster = SoftCluster.decodeCluster(key);
-        List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+        List<ClusterObservations> values = mapWriter.getValue(key);
 
-        for (FuzzyKMeansInfo value : values) {
-          Double val = pointTotalProbMap.get(value.getVector());
+        for (ClusterObservations value : values) {
+          Double val = pointTotalProbMap.get(value.getS1());
           double probVal = 0.0;
           if (val != null) {
             probVal = val;
           }
-          pointTotalProbMap.put(value.getVector(), probVal + value.getProbability());
+          pointTotalProbMap.put(value.getS1(), probVal + value.getS0());
         }
       }
       for (Map.Entry<Vector, Double> entry : pointTotalProbMap.entrySet()) {
@@ -393,7 +384,7 @@ public class TestFuzzyKmeansClustering e
         Vector vec = tweakValue(points.get(i).get());
 
         SoftCluster cluster = new SoftCluster(vec, i);
-        cluster.addPoint(cluster.getCenter(), 1);
+        cluster.observe(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
 
@@ -408,10 +399,9 @@ public class TestFuzzyKmeansClustering e
       conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
       conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
 
-      DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                        conf,
-                                                                                                                        mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       mapper.setup(mapContext);
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapContext);
@@ -419,12 +409,12 @@ public class TestFuzzyKmeansClustering e
 
       // run combiner
       FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
-      DummyRecordWriter<Text, FuzzyKMeansInfo> combinerWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo>.Context combinerContext = DummyRecordWriter
-          .build(combiner, conf, combinerWriter, Text.class, FuzzyKMeansInfo.class);
+      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+          .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
       combiner.setup(combinerContext);
       for (Text key : mapWriter.getKeys()) {
-        List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+        List<ClusterObservations> values = mapWriter.getValue(key);
         combiner.reduce(new Text(key), values, combinerContext);
       }
 
@@ -432,7 +422,7 @@ public class TestFuzzyKmeansClustering e
       assertEquals("Combiner Output", k + 1, combinerWriter.getData().size());
 
       for (Text key : combinerWriter.getKeys()) {
-        List<FuzzyKMeansInfo> values = combinerWriter.getValue(key);
+        List<ClusterObservations> values = combinerWriter.getValue(key);
         assertEquals("too many values", 1, values.size());
       }
     }
@@ -465,10 +455,9 @@ public class TestFuzzyKmeansClustering e
       conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
       conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
 
-      DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                        conf,
-                                                                                                                        mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       mapper.setup(mapContext);
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapContext);
@@ -476,27 +465,24 @@ public class TestFuzzyKmeansClustering e
 
       // run combiner
       FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
-      DummyRecordWriter<Text, FuzzyKMeansInfo> combinerWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo>.Context combinerContext = DummyRecordWriter
-          .build(combiner, conf, combinerWriter, Text.class, FuzzyKMeansInfo.class);
+      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+          .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
       combiner.setup(combinerContext);
       for (Text key : mapWriter.getKeys()) {
-        List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+        List<ClusterObservations> values = mapWriter.getValue(key);
         combiner.reduce(new Text(key), values, combinerContext);
       }
 
       // run reducer
       FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
       DummyRecordWriter<Text, SoftCluster> reducerWriter = new DummyRecordWriter<Text, SoftCluster>();
-      Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster>.Context reducerContext = DummyRecordWriter.build(reducer,
-                                                                                                         conf,
-                                                                                                         reducerWriter,
-                                                                                                         Text.class,
-                                                                                                         FuzzyKMeansInfo.class);
+      Reducer<Text, ClusterObservations, Text, SoftCluster>.Context reducerContext = DummyRecordWriter
+          .build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
       reducer.setup(clusterList, conf);
 
       for (Text key : combinerWriter.getKeys()) {
-        List<FuzzyKMeansInfo> values = combinerWriter.getValue(key);
+        List<ClusterObservations> values = combinerWriter.getValue(key);
         reducer.reduce(new Text(key), values, reducerContext);
       }
 
@@ -522,9 +508,9 @@ public class TestFuzzyKmeansClustering e
         List<SoftCluster> values = reducerWriter.getValue(new Text(clusterId));
         SoftCluster cluster = values.get(0);
         System.out.println("ref= " + key.toString() + " cluster= " + cluster.toString());
-        cluster.recomputeCenter();
-        assertEquals("key center: " + key.getCenter().asFormatString() + " does not equal cluster: "
-            + cluster.getCenter().asFormatString(), key.getCenter(), cluster.getCenter());
+        cluster.computeParameters();
+        assertEquals("key center: " + AbstractCluster.formatVector(key.getCenter(), null) + " does not equal cluster: "
+            + AbstractCluster.formatVector(cluster.getCenter(), null), key.getCenter(), cluster.getCenter());
       }
     }
   }
@@ -541,11 +527,11 @@ public class TestFuzzyKmeansClustering e
         Vector vec = tweakValue(points.get(i).get());
 
         SoftCluster cluster = new SoftCluster(vec, i);
-        cluster.addPoint(cluster.getCenter(), 1);
+        cluster.observe(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
       for (SoftCluster softCluster : clusterList) {
-        softCluster.recomputeCenter();
+        softCluster.computeParameters();
       }
 
       // run mapper
@@ -560,10 +546,9 @@ public class TestFuzzyKmeansClustering e
       conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
       conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
 
-      DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                        conf,
-                                                                                                                        mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       mapper.setup(mapContext);
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapContext);
@@ -571,27 +556,24 @@ public class TestFuzzyKmeansClustering e
 
       // run combiner
       FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
-      DummyRecordWriter<Text, FuzzyKMeansInfo> combinerWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
-      Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo>.Context combinerContext = DummyRecordWriter
-          .build(combiner, conf, combinerWriter, Text.class, FuzzyKMeansInfo.class);
+      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+          .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
       combiner.setup(combinerContext);
       for (Text key : mapWriter.getKeys()) {
-        List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+        List<ClusterObservations> values = mapWriter.getValue(key);
         combiner.reduce(new Text(key), values, combinerContext);
       }
 
       // run reducer
       FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
       DummyRecordWriter<Text, SoftCluster> reducerWriter = new DummyRecordWriter<Text, SoftCluster>();
-      Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster>.Context reducerContext = DummyRecordWriter.build(reducer,
-                                                                                                         conf,
-                                                                                                         reducerWriter,
-                                                                                                         Text.class,
-                                                                                                         FuzzyKMeansInfo.class);
+      Reducer<Text, ClusterObservations, Text, SoftCluster>.Context reducerContext = DummyRecordWriter
+          .build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
       reducer.setup(clusterList, conf);
 
       for (Text key : combinerWriter.getKeys()) {
-        List<FuzzyKMeansInfo> values = combinerWriter.getValue(key);
+        List<ClusterObservations> values = combinerWriter.getValue(key);
         reducer.reduce(new Text(key), values, reducerContext);
       }
 
@@ -602,7 +584,7 @@ public class TestFuzzyKmeansClustering e
         reducerClusters.add(values.get(0));
       }
       for (SoftCluster softCluster : reducerClusters) {
-        softCluster.recomputeCenter();
+        softCluster.computeParameters();
       }
 
       FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
@@ -655,19 +637,20 @@ public class TestFuzzyKmeansClustering e
     }
   }
 
-  public void testFuzzyKMeansInfoSerialization() throws IOException {
+  /** Serializes a ClusterObservations, reads it back and compares s0, s1 and the combiner state. */
+  public void testClusterObservationsSerialization() throws IOException {
     double[] data = { 1.1, 2.2, 3.3 };
     Vector vector = new DenseVector(data);
-    FuzzyKMeansInfo reference = new FuzzyKMeansInfo(2.0, vector, 1);
+    ClusterObservations reference = new ClusterObservations(1, 2.0, vector, vector);
     DataOutputBuffer out = new DataOutputBuffer();
     reference.write(out);
-    FuzzyKMeansInfo info = new FuzzyKMeansInfo();
+    ClusterObservations info = new ClusterObservations();
     DataInputBuffer in = new DataInputBuffer();
     in.reset(out.getData(), out.getLength());
     info.readFields(in);
-    assertEquals("probability", reference.getProbability(), info.getProbability());
-    assertTrue("point total", reference.getVector().equals(info.getVector()));
-    assertEquals("combiner", reference.getCombinerPass(), info.getCombinerPass());
+    assertEquals("probability", reference.getS0(), info.getS0());
+    assertEquals("point total", reference.getS1(), info.getS1());
+    assertEquals("combiner", reference.getCombinerState(), info.getCombinerState());
   }
 
 }

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Sat Jul 24 03:39:30 2010
@@ -32,6 +32,8 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.ClusterObservations;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -111,31 +113,20 @@ public class TestKmeansClustering extend
       List<Cluster> clusters = new ArrayList<Cluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i);
-        clusters.add(new VisibleCluster(vec));
+        clusters.add(new Cluster(vec, i));
       }
       // iterate clusters until they converge
       int maxIter = 10;
       List<List<Cluster>> clustersList = KMeansClusterer.clusterPoints(points, clusters, measure, maxIter, 0.001);
       clusters = clustersList.get(clustersList.size() - 1);
       for (int c = 0; c < clusters.size(); c++) {
-        Cluster cluster = clusters.get(c);
-        System.out.println(cluster.toString());
+        AbstractCluster cluster = clusters.get(c);
+        System.out.println(cluster.asFormatString(null));
         assertEquals("Cluster " + c + " test " + (k + 1), expectedNumPoints[k][c], cluster.getNumPoints());
       }
     }
   }
 
-  public void testStd() {
-    List<Vector> points = getPoints(reference);
-    Cluster c = new Cluster(points.get(0));
-    for (Vector p : points) {
-      c.addPoint(p);
-      if (c.getNumPoints() > 1) {
-        assertTrue(c.getStd() > 0.0);
-      }
-    }
-  }
-
   private static Map<String, Cluster> loadClusterMap(List<Cluster> clusters) {
     Map<String, Cluster> clusterMap = new HashMap<String, Cluster>();
 
@@ -156,16 +147,15 @@ public class TestKmeansClustering extend
     List<VectorWritable> points = getPointsWritable(reference);
     for (int k = 0; k < points.size(); k++) {
       // pick k initial cluster centers at random
-      DummyRecordWriter<Text, KMeansInfo> mapWriter = new DummyRecordWriter<Text, KMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, KMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                   conf,
-                                                                                                                   mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       List<Cluster> clusters = new ArrayList<Cluster>();
 
       for (int i = 0; i < k + 1; i++) {
         Cluster cluster = new Cluster(points.get(i).get(), i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter());
+        cluster.observe(cluster.getCenter(), 1);
         clusters.add(cluster);
       }
       mapper.setup(clusters, measure);
@@ -175,16 +165,14 @@ public class TestKmeansClustering extend
         mapper.map(new Text(), point, mapContext);
       }
       assertEquals("Number of map results", k + 1, mapWriter.getData().size());
-      // now verify that all points are correctly allocated
-      EuclideanDistanceMeasure euclideanDistanceMeasure = measure;
       Map<String, Cluster> clusterMap = loadClusterMap(clusters);
       for (Text key : mapWriter.getKeys()) {
-        Cluster cluster = clusterMap.get(key.toString());
-        List<KMeansInfo> values = mapWriter.getValue(key);
-        for (KMeansInfo value : values) {
-          double distance = euclideanDistanceMeasure.distance(cluster.getCenter(), value.getPointTotal());
-          for (Cluster c : clusters) {
-            assertTrue("distance error", distance <= euclideanDistanceMeasure.distance(value.getPointTotal(), c.getCenter()));
+        AbstractCluster cluster = clusterMap.get(key.toString());
+        List<ClusterObservations> values = mapWriter.getValue(key);
+        for (ClusterObservations value : values) {
+          double distance = measure.distance(cluster.getCenter(), value.getS1());
+          for (AbstractCluster c : clusters) {
+            assertTrue("distance error", distance <= measure.distance(value.getS1(), c.getCenter()));
           }
         }
       }
@@ -205,17 +193,16 @@ public class TestKmeansClustering extend
     List<VectorWritable> points = getPointsWritable(reference);
     for (int k = 0; k < points.size(); k++) {
       // pick k initial cluster centers at random
-      DummyRecordWriter<Text, KMeansInfo> mapWriter = new DummyRecordWriter<Text, KMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, KMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                   conf,
-                                                                                                                   mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       List<Cluster> clusters = new ArrayList<Cluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
 
         Cluster cluster = new Cluster(vec, i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter());
+        cluster.observe(cluster.getCenter(), 1);
         clusters.add(cluster);
       }
       mapper.setup(clusters, measure);
@@ -225,12 +212,9 @@ public class TestKmeansClustering extend
       }
       // now combine the data
       KMeansCombiner combiner = new KMeansCombiner();
-      DummyRecordWriter<Text, KMeansInfo> combinerWriter = new DummyRecordWriter<Text, KMeansInfo>();
-      Reducer<Text, KMeansInfo, Text, KMeansInfo>.Context combinerContext = DummyRecordWriter.build(combiner,
-                                                                                                    conf,
-                                                                                                    combinerWriter,
-                                                                                                    Text.class,
-                                                                                                    KMeansInfo.class);
+      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+          .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
       for (Text key : mapWriter.getKeys()) {
         combiner.reduce(new Text(key), mapWriter.getValue(key), combinerContext);
       }
@@ -240,13 +224,12 @@ public class TestKmeansClustering extend
       int count = 0;
       Vector total = new DenseVector(2);
       for (Text key : combinerWriter.getKeys()) {
-        List<KMeansInfo> values = combinerWriter.getValue(key);
+        List<ClusterObservations> values = combinerWriter.getValue(key);
         assertEquals("too many values", 1, values.size());
-        // String value = values.get(0).toString();
-        KMeansInfo info = values.get(0);
+        ClusterObservations info = values.get(0);
 
-        count += info.getPoints();
-        total = total.plus(info.getPointTotal());
+        count += info.getS0();
+        total = total.plus(info.getS1());
       }
       assertEquals("total points", 9, count);
       assertEquals("point total[0]", 27, (int) total.get(0));
@@ -269,10 +252,9 @@ public class TestKmeansClustering extend
     for (int k = 0; k < points.size(); k++) {
       System.out.println("K = " + k);
       // pick k initial cluster centers at random
-      DummyRecordWriter<Text, KMeansInfo> mapWriter = new DummyRecordWriter<Text, KMeansInfo>();
-      Mapper<WritableComparable<?>, VectorWritable, Text, KMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
-                                                                                                                   conf,
-                                                                                                                   mapWriter);
+      DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+          .build(mapper, conf, mapWriter);
       List<Cluster> clusters = new ArrayList<Cluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
@@ -288,12 +270,9 @@ public class TestKmeansClustering extend
       }
       // now combine the data
       KMeansCombiner combiner = new KMeansCombiner();
-      DummyRecordWriter<Text, KMeansInfo> combinerWriter = new DummyRecordWriter<Text, KMeansInfo>();
-      Reducer<Text, KMeansInfo, Text, KMeansInfo>.Context combinerContext = DummyRecordWriter.build(combiner,
-                                                                                                    conf,
-                                                                                                    combinerWriter,
-                                                                                                    Text.class,
-                                                                                                    KMeansInfo.class);
+      DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+      Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+          .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
       for (Text key : mapWriter.getKeys()) {
         combiner.reduce(new Text(key), mapWriter.getValue(key), combinerContext);
       }
@@ -302,11 +281,11 @@ public class TestKmeansClustering extend
       KMeansReducer reducer = new KMeansReducer();
       reducer.setup(clusters, measure);
       DummyRecordWriter<Text, Cluster> reducerWriter = new DummyRecordWriter<Text, Cluster>();
-      Reducer<Text, KMeansInfo, Text, Cluster>.Context reducerContext = DummyRecordWriter.build(reducer,
-                                                                                                conf,
-                                                                                                reducerWriter,
-                                                                                                Text.class,
-                                                                                                KMeansInfo.class);
+      Reducer<Text, ClusterObservations, Text, Cluster>.Context reducerContext = DummyRecordWriter.build(reducer,
+                                                                                                         conf,
+                                                                                                         reducerWriter,
+                                                                                                         Text.class,
+                                                                                                         ClusterObservations.class);
       for (Text key : combinerWriter.getKeys()) {
         reducer.reduce(new Text(key), combinerWriter.getValue(key), reducerContext);
       }
@@ -339,7 +318,7 @@ public class TestKmeansClustering extend
         converged = converged && cluster.isConverged();
         // Since we aren't roundtripping through Writable, we need to compare the reference center with the
         // cluster centroid
-        cluster.recomputeCenter();
+        cluster.computeParameters();
         assertEquals(ref.getCenter(), cluster.getCenter());
       }
       if (k == 8) {
@@ -353,7 +332,7 @@ public class TestKmeansClustering extend
   /** Story: User wishes to run kmeans job on reference data */
   public void testKMeansSeqJob() throws Exception {
     List<VectorWritable> points = getPointsWritable(reference);
-  
+
     Path pointsPath = getTestTempDirPath("points");
     Path clustersPath = getTestTempDirPath("clusters");
     Configuration conf = new Configuration();
@@ -365,13 +344,13 @@ public class TestKmeansClustering extend
       Path path = new Path(clustersPath, "part-00000");
       FileSystem fs = FileSystem.get(path.toUri(), conf);
       SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Cluster.class);
-  
+
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
-  
+
         Cluster cluster = new Cluster(vec, i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter());
+        cluster.observe(cluster.getCenter(), 1);
         writer.append(new Text(cluster.getIdentifier()), cluster);
       }
       writer.close();
@@ -385,7 +364,7 @@ public class TestKmeansClustering extend
           optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION),
           optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD };
       new KMeansDriver().run(args);
-  
+
       // now compare the expected clusters with actual
       Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(clusteredPointsPath, "part-m-0"), conf);
@@ -426,7 +405,7 @@ public class TestKmeansClustering extend
 
         Cluster cluster = new Cluster(vec, i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter());
+        cluster.observe(cluster.getCenter(), 1);
         writer.append(new Text(cluster.getIdentifier()), cluster);
       }
       writer.close();
@@ -491,8 +470,6 @@ public class TestKmeansClustering extend
 
     // now compare the expected clusters with actual
     Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
-    //String[] outFiles = outDir.list();
-    //assertEquals("output dir files?", 4, outFiles.length);
     DummyOutputCollector<IntWritable, WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable, WeightedVectorWritable>();
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(clusteredPointsPath, "part-m-00000"), conf);
 

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java Sat Jul 24 03:39:30 2010
@@ -29,6 +29,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.math.RandomAccessSparseVector;
@@ -87,7 +88,7 @@ public class TestRandomSeedGenerator ext
     
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "part-randomSeed"), conf);
     Writable key = (Writable) reader.getKeyClass().newInstance();
-    Cluster value = (Cluster) reader.getValueClass().newInstance();
+    AbstractCluster value = (AbstractCluster) reader.getValueClass().newInstance();
     
     int clusterCount = 0;
     Set<Integer> set = new HashSet<Integer>();

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Sat Jul 24 03:39:30 2010
@@ -69,7 +69,7 @@ public class TestMeanShift extends Mahou
       }
     }
     for (MeanShiftCanopy canopy : canopies) {
-      int ch = 'A' + canopy.getCanopyId();
+      int ch = 'A' + canopy.getId();
       for (int pid : canopy.getBoundPoints().toList()) {
         Vector pt = raw[pid];
         out[(int) pt.getQuick(0)][(int) pt.getQuick(1)] = (char) ch;
@@ -203,8 +203,8 @@ public class TestMeanShift extends Mahou
     for (Map.Entry<String, MeanShiftCanopy> stringMeanShiftCanopyEntry : refCanopyMap.entrySet()) {
       MeanShiftCanopy ref = stringMeanShiftCanopyEntry.getValue();
 
-      MeanShiftCanopy canopy = canopyMap.get((ref.isConverged() ? "V-" : "C-") + ref.getCanopyId());
-      assertEquals("ids", ref.getCanopyId(), canopy.getCanopyId());
+      MeanShiftCanopy canopy = canopyMap.get((ref.isConverged() ? "MSV-" : "MSC-") + ref.getId());
+      assertEquals("ids", ref.getId(), canopy.getId());
       assertEquals("centers(" + ref.getIdentifier() + ')', ref.getCenter().asFormatString(), canopy.getCenter().asFormatString());
       assertEquals("bound points", ref.getBoundPoints().toList().size(), canopy.getBoundPoints().toList().size());
     }
@@ -241,6 +241,7 @@ public class TestMeanShift extends Mahou
     conf.set(MeanShiftCanopyConfigKeys.T1_KEY, "4");
     conf.set(MeanShiftCanopyConfigKeys.T2_KEY, "1");
     conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.5");
+    conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, "output/control");
 
     MeanShiftCanopyMapper mapper = new MeanShiftCanopyMapper();
     DummyRecordWriter<Text, MeanShiftCanopy> mapWriter = new DummyRecordWriter<Text, MeanShiftCanopy>();
@@ -280,11 +281,10 @@ public class TestMeanShift extends Mahou
     for (Map.Entry<String, MeanShiftCanopy> mapEntry : reducerReferenceMap.entrySet()) {
       MeanShiftCanopy refCanopy = mapEntry.getValue();
 
-      List<MeanShiftCanopy> values = reduceWriter.getValue(new Text((refCanopy.isConverged() ? "V-" : "C-")
-          + refCanopy.getCanopyId()));
+      List<MeanShiftCanopy> values = reduceWriter.getValue(new Text((refCanopy.isConverged() ? "MSV-" : "MSC-") + refCanopy.getId()));
       assertEquals("values", 1, values.size());
       MeanShiftCanopy reducerCanopy = values.get(0);
-      assertEquals("ids", refCanopy.getCanopyId(), reducerCanopy.getCanopyId());
+      assertEquals("ids", refCanopy.getId(), reducerCanopy.getId());
       int refNumPoints = refCanopy.getNumPoints();
       int reducerNumPoints = reducerCanopy.getNumPoints();
       assertEquals("numPoints", refNumPoints, reducerNumPoints);

Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java?rev=978786&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java Sat Jul 24 03:39:30 2010
@@ -0,0 +1,19 @@
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+/**
+ * Accepts the paths whose last element starts with "clusters-".
+ */
+public class ClustersFilter implements PathFilter {
+
+  private static final String PREFIX = "clusters-";
+
+  @Override
+  public boolean accept(Path path) {
+    // Look at the last path element only: a match anywhere in the full path string would
+    // also accept every entry stored beneath a parent directory named clusters-*.
+    return path.getName().startsWith(PREFIX);
+  }
+}

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java Sat Jul 24 03:39:30 2010
@@ -17,17 +17,20 @@
 
 package org.apache.mahout.clustering.display;
 
-import java.awt.BasicStroke;
 import java.awt.Graphics;
 import java.awt.Graphics2D;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.canopy.Canopy;
 import org.apache.mahout.clustering.canopy.CanopyClusterer;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
@@ -35,12 +38,6 @@ class DisplayCanopy extends DisplayClust
 
   private static final long serialVersionUID = 1L;
 
-  private static List<Canopy> canopies;
-
-  private static final double T1 = 3.0;
-
-  private static final double T2 = 1.6;
-
   DisplayCanopy() {
     initialize();
     this.setTitle("Canopy Clusters (>" + (int) (SIGNIFICANCE * 100) + "% of population)");
@@ -48,33 +45,41 @@ class DisplayCanopy extends DisplayClust
 
+  /** Plots the sample data and then the clusters onto the given graphics context. */
   @Override
   public void paint(Graphics g) {
-    Graphics2D g2 = (Graphics2D) g;
-    plotSampleData(g2);
-    Vector dv = new DenseVector(2);
-    for (Canopy canopy : canopies) {
-      if (canopy.getNumPoints() > DisplayClustering.SAMPLE_DATA.size() * SIGNIFICANCE) {
-        g2.setStroke(new BasicStroke(2));
-        g2.setColor(COLORS[1]);
-        dv.assign(T1);
-        Vector center = canopy.computeCentroid();
-        DisplayClustering.plotEllipse(g2, center, dv);
-        g2.setStroke(new BasicStroke(3));
-        g2.setColor(COLORS[0]);
-        dv.assign(T2);
-        DisplayClustering.plotEllipse(g2, center, dv);
-      }
-    }
+    plotSampleData((Graphics2D) g);
+    plotClusters((Graphics2D) g);
   }
 
-  public static void main(String[] args) {
+  /**
+   * Writes generated samples to "samples", builds canopy clusters from them and constructs a DisplayCanopy.
+   */
+  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+      IllegalAccessException {
+    SIGNIFICANCE = 0.05;
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    HadoopUtil.overwriteOutput(samples);
+    HadoopUtil.overwriteOutput(output);
     RandomUtils.useTestSeed();
-    DisplayClustering.generateSamples();
-    List<Vector> points = new ArrayList<Vector>();
-    for (VectorWritable sample : SAMPLE_DATA) {
-      points.add(sample.get());
+    generateSamples();
+    writeSampleData(samples);
+    // development switch: true runs CanopyDriver over the written samples, false clusters SAMPLE_DATA in memory
+    boolean runDriver = true;
+    if (runDriver) {
+      new CanopyDriver().buildClusters(samples, output, ManhattanDistanceMeasure.class.getName(), T1, T2, true);
+      loadClusters(output);
+    } else {
+      List<Vector> points = new ArrayList<Vector>();
+      for (VectorWritable sample : SAMPLE_DATA) {
+        points.add(sample.get());
+      }
+      List<Canopy> canopies = CanopyClusterer.createCanopies(points, new ManhattanDistanceMeasure(), T1, T2);
+      CanopyClusterer.updateCentroids(canopies);
+      // every Canopy is a Cluster, so the list can be copied without an explicit loop
+      CLUSTERS.add(new ArrayList<Cluster>(canopies));
     }
-    canopies = CanopyClusterer.createCanopies(points, new ManhattanDistanceMeasure(), T1, T2);
-    CanopyClusterer.updateCentroids(canopies);
+
     new DisplayCanopy();
   }
+
 }

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java Sat Jul 24 03:39:30 2010
@@ -28,11 +28,21 @@ import java.awt.event.WindowEvent;
 import java.awt.geom.AffineTransform;
 import java.awt.geom.Ellipse2D;
 import java.awt.geom.Rectangle2D;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
+import org.apache.mahout.clustering.kmeans.OutputLogFilter;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.Vector;
@@ -59,6 +69,10 @@ public class DisplayClustering extends F
   protected static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
       Color.lightGray };
 
+  protected static final double T1 = 3.0;
+
+  protected static final double T2 = 2.8;
+
   protected static int res; // screen resolution
 
   public DisplayClustering() {
@@ -215,6 +229,88 @@ public class DisplayClustering extends F
     }
   }
 
+  /**
+   * Write the SAMPLE_DATA vectors to a SequenceFile of (Text, VectorWritable) records. Each vector is
+   * appended under a new, empty Text key.
+   * 
+   * @param output
+   *          the Path of the SequenceFile to create
+   * @throws IOException
+   *           if the file cannot be created or written
+   */
+  protected static void writeSampleData(Path output) throws IOException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class);
+    try {
+      for (VectorWritable vw : SAMPLE_DATA) {
+        writer.append(new Text(), vw);
+      }
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Read the clusters stored in the SequenceFiles of a single clusters directory. Only the files accepted by
+   * OutputLogFilter are read, and each cluster read is logged.
+   * 
+   * @param clustersIn
+   *          the Path of the directory holding the cluster SequenceFiles
+   * @return a List of the Clusters in the order they were read
+   * @throws IOException
+   *           if the directory cannot be listed or a file cannot be read
+   * @throws InstantiationException
+   *           if the value class recorded in a file cannot be instantiated
+   * @throws IllegalAccessException
+   *           if the value class recorded in a file or its no-arg constructor is not accessible
+   */
+  protected static List<Cluster> readClusters(Path clustersIn) throws IOException, InstantiationException, IllegalAccessException {
+    List<Cluster> clusters = new ArrayList<Cluster>();
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(clustersIn.toUri(), conf);
+    FileStatus[] status = fs.listStatus(clustersIn, new OutputLogFilter());
+    for (FileStatus s : status) {
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(), conf);
+      try {
+        Text key = new Text();
+        Writable value = (Writable) reader.getValueClass().newInstance();
+        while (reader.next(key, value)) {
+          Cluster cluster = (Cluster) value;
+          log.info("Reading Cluster:" + cluster.getId() + " center:" + AbstractCluster.formatVector(cluster.getCenter(), null)
+            + " numPoints:" + cluster.getNumPoints() + " radius:" + AbstractCluster.formatVector(cluster.getRadius(), null));
+          clusters.add(cluster);
+          // the value just read is kept in the list, so the next record needs a new instance to read into
+          value = (Writable) reader.getValueClass().newInstance();
+        }
+      } finally {
+        reader.close();
+      }
+    }
+    return clusters;
+  }
+
+  /**
+   * Load the clusters found under the output directory into CLUSTERS. Each entry of the output directory
+   * accepted by ClustersFilter is read with readClusters and adds one List of Clusters to CLUSTERS.
+   * 
+   * @param output
+   *          the Path of the clustering output directory
+   * @throws IOException
+   *           if a directory cannot be listed or a file cannot be read
+   * @throws InstantiationException
+   *           if a stored cluster cannot be instantiated
+   * @throws IllegalAccessException
+   *           if a stored cluster class or its no-arg constructor is not accessible
+   */
+  protected static void loadClusters(Path output) throws IOException, InstantiationException, IllegalAccessException {
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(output.toUri(), conf);
+    for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
+      CLUSTERS.add(readClusters(s.getPath()));
+    }
+  }
+
   /**
    * Generate random samples and add them to the sampleData
    * 

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java Sat Jul 24 03:39:30 2010
@@ -19,11 +19,16 @@ package org.apache.mahout.clustering.dis
 
 import java.awt.Graphics;
 import java.awt.Graphics2D;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.fs.Path;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansClusterer;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
@@ -44,43 +49,73 @@ class DisplayFuzzyKMeans extends Display
     plotClusters((Graphics2D) g);
   }
 
-  public static void main(String[] args) {
+  /**
+   * Generate the sample data, cluster it with fuzzy k-means and display the result. The local flag b selects
+   * between running FuzzyKMeansDriver over the files written under "samples" and "output" and running
+   * FuzzyKMeansClusterer directly on the in-memory points.
+   */
+  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+      IllegalAccessException {
     DistanceMeasure measure = new ManhattanDistanceMeasure();
     double threshold = 0.001;
     int numClusters = 3;
     int numIterations = 10;
     int m = 3;
-    
+
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    HadoopUtil.overwriteOutput(samples);
+    HadoopUtil.overwriteOutput(output);
     RandomUtils.useTestSeed();
     DisplayClustering.generateSamples();
-    List<Vector> points = new ArrayList<Vector>();
-    for (VectorWritable sample : SAMPLE_DATA) {
-      points.add((Vector) sample.get());
-    }
-    int id = 0;
-    List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();
-    for (Vector point : points) {
-      if (initialClusters.size() < Math.min(numClusters, points.size())) {
-        initialClusters.add(new SoftCluster(point, id++));
-      } else {
-        break;
+    // hard-coded switch: true runs the driver on files, false clusters the in-memory points
+    boolean b = true;
+    if (b) {
+      writeSampleData(samples);
+      Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), numClusters);
+      FuzzyKMeansDriver.runJob(samples,
+                               clusters,
+                               output,
+                               measure.getClass().getName(),
+                               threshold,
+                               numIterations,
+                               1,
+                               m,
+                               true,
+                               true,
+                               threshold,
+                               b);
+      loadClusters(output);
+    } else {
+      List<Vector> points = new ArrayList<Vector>();
+      for (VectorWritable sample : SAMPLE_DATA) {
+        points.add((Vector) sample.get());
       }
-    }
-    List<List<SoftCluster>> results = FuzzyKMeansClusterer.clusterPoints(points,
-                                                                         initialClusters,
-                                                                         measure,
-                                                                         threshold,
-                                                                         m,
-                                                                         numIterations);
-    for (List<SoftCluster> models : results) {
-      List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
-      for (SoftCluster cluster : models) {
-        org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
-        if (isSignificant(cluster2)) {
-          clusters.add(cluster2);
+      int id = 0;
+      List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();
+      for (Vector point : points) {
+        if (initialClusters.size() < Math.min(numClusters, points.size())) {
+          initialClusters.add(new SoftCluster(point, id++));
+        } else {
+          break;
+        }
+      }
+      List<List<SoftCluster>> results = FuzzyKMeansClusterer.clusterPoints(points,
+                                                                           initialClusters,
+                                                                           measure,
+                                                                           threshold,
+                                                                           m,
+                                                                           numIterations);
+      for (List<SoftCluster> models : results) {
+        List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
+        for (SoftCluster cluster : models) {
+          org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
+          if (isSignificant(cluster2)) {
+            clusters.add(cluster2);
+          }
         }
+        CLUSTERS.add(clusters);
       }
-      CLUSTERS.add(clusters);
     }
     new DisplayFuzzyKMeans();
   }

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java Sat Jul 24 03:39:30 2010
@@ -19,11 +19,17 @@ package org.apache.mahout.clustering.dis
 
 import java.awt.Graphics;
 import java.awt.Graphics2D;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.kmeans.Cluster;
 import org.apache.mahout.clustering.kmeans.KMeansClusterer;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
@@ -39,40 +45,58 @@ class DisplayKMeans extends DisplayClust
     this.setTitle("k-Means Clusters (>" + (int) (SIGNIFICANCE * 100) + "% of population)");
   }
 
-  public static void main(String[] args) {
+  /**
+   * Generate the sample data, write it under "samples", cluster it with k-means and display the result. The
+   * local flag b selects between running KMeansDriver over the written files and running KMeansClusterer
+   * directly on the in-memory points.
+   */
+  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+      IllegalAccessException {
     DistanceMeasure measure = new ManhattanDistanceMeasure();
     int numClusters = 3;
     int maxIter = 10;
     double distanceThreshold = 0.001;
-    
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    HadoopUtil.overwriteOutput(samples);
+    HadoopUtil.overwriteOutput(output);
+
     RandomUtils.useTestSeed();
     DisplayClustering.generateSamples();
-    List<Vector> points = new ArrayList<Vector>();
-    for (VectorWritable sample : SAMPLE_DATA) {
-      points.add(sample.get());
-    }
-    List<Cluster> initialClusters = new ArrayList<Cluster>();
-    int id = 0;
-    for (Vector point : points) {
-      if (initialClusters.size() < Math.min(numClusters, points.size())) {
-        initialClusters.add(new Cluster(point, id++));
-      } else {
-        break;
+    writeSampleData(samples);
+    // hard-coded switch: true runs the driver on files, false clusters the in-memory points
+    boolean b = true;
+    if (b) {
+      Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), numClusters);
+      KMeansDriver.runJob(samples, clusters, output, measure.getClass().getName(), distanceThreshold, maxIter, 1, true, true);
+      loadClusters(output);
+    } else {
+      List<Vector> points = new ArrayList<Vector>();
+      for (VectorWritable sample : SAMPLE_DATA) {
+        points.add(sample.get());
       }
-    }
-    result = KMeansClusterer.clusterPoints(points, initialClusters, measure, maxIter, distanceThreshold);
-    for (List<Cluster> models : result) {
-      List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
-      for (Cluster cluster : models) {
-        org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
-        if (isSignificant(cluster2)) {
-          clusters.add(cluster2);
+      List<Cluster> initialClusters = new ArrayList<Cluster>();
+      int id = 0;
+      for (Vector point : points) {
+        if (initialClusters.size() < Math.min(numClusters, points.size())) {
+          initialClusters.add(new Cluster(point, id++));
+        } else {
+          break;
         }
       }
-      CLUSTERS.add(clusters);
-    }
 
-    System.out.println(result.size());
+      result = KMeansClusterer.clusterPoints(points, initialClusters, measure, maxIter, distanceThreshold);
+      for (List<Cluster> models : result) {
+        List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
+        for (AbstractCluster cluster : models) {
+          org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
+          if (isSignificant(cluster2)) {
+            clusters.add(cluster2);
+          }
+        }
+        CLUSTERS.add(clusters);
+      }
+    }
     new DisplayKMeans();
   }
 

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java Sat Jul 24 03:39:30 2010
@@ -21,11 +21,16 @@ import java.awt.Color;
 import java.awt.Graphics;
 import java.awt.Graphics2D;
 import java.awt.geom.AffineTransform;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopyClusterer;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
+import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.math.DenseVector;
@@ -38,8 +43,6 @@ final class DisplayMeanShift extends Dis
 
   private static final Logger log = LoggerFactory.getLogger(DisplayMeanShift.class);
 
-  private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
   private static double t1;
 
   private static double t2;
@@ -70,7 +73,8 @@ final class DisplayMeanShift extends Dis
       DisplayClustering.plotRectangle(g2, v.get(), dv);
     }
     int i = 0;
-    for (MeanShiftCanopy canopy : canopies) {
+    for (Cluster cluster : CLUSTERS.get(CLUSTERS.size()-1)) {
+      MeanShiftCanopy canopy = (MeanShiftCanopy) cluster;
       if (canopy.getBoundPoints().toList().size() >= SIGNIFICANCE * DisplayClustering.SAMPLE_DATA.size()) {
         g2.setColor(COLORS[Math.min(i++, DisplayClustering.COLORS.length - 1)]);
         int count = 0;
@@ -88,20 +92,42 @@ final class DisplayMeanShift extends Dis
     }
   }
 
-  public static void main(String[] args) {
+  /**
+   * Generate the sample data, write it under "samples", cluster it with mean shift and display the result.
+   * The local flag b selects between running MeanShiftCanopyDriver over the written files and running
+   * MeanShiftCanopyClusterer directly on the in-memory points.
+   */
+  public static void main(String[] args) throws IOException, InstantiationException, IllegalAccessException, InterruptedException,
+      ClassNotFoundException {
     t1 = 1.5;
-    t2 = 0.1;
+    t2 = 0.5;
     SIGNIFICANCE = 0.02;
+    EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
+
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    HadoopUtil.overwriteOutput(samples);
+    HadoopUtil.overwriteOutput(output);
 
     RandomUtils.useTestSeed();
     DisplayClustering.generateSamples();
-    List<Vector> points = new ArrayList<Vector>();
-    for (VectorWritable sample : SAMPLE_DATA) {
-      points.add(sample.get());
-    }
-    canopies = MeanShiftCanopyClusterer.clusterPoints(points, new EuclideanDistanceMeasure(), 0.005, t1, t2, 20);
-    for (MeanShiftCanopy canopy : canopies) {
-      log.info(canopy.toString());
+    writeSampleData(samples);
+    // hard-coded switch: true runs the driver on files, false clusters the in-memory points
+    boolean b = true;
+    if (b) {
+      MeanShiftCanopyDriver.runJob(samples, output, measure.getClass().getName(), t1, t2, 0.005, 20, false, true, true);
+      loadClusters(output);
+    } else {
+      List<Vector> points = new ArrayList<Vector>();
+      for (VectorWritable sample : SAMPLE_DATA) {
+        points.add(sample.get());
+      }
+      List<MeanShiftCanopy> canopies = MeanShiftCanopyClusterer.clusterPoints(points, measure, 0.005, t1, t2, 20);
+      for (MeanShiftCanopy canopy : canopies) {
+        log.info(canopy.toString());
+      }
+      // the plotting loop draws the last entry of CLUSTERS, so the in-memory canopies must be added to it
+      CLUSTERS.add(new ArrayList<Cluster>(canopies));
     }
     new DisplayMeanShift();
   }

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Jul 24 03:39:30 2010
@@ -40,8 +40,8 @@ import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.clustering.WeightedVectorWritable;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.Pair;
@@ -191,7 +191,7 @@ public final class ClusterDumper extends
                 WeightedVectorWritable point = iterator.next();
                 writer.write(String.valueOf(point.getWeight()));
                 writer.write(": ");
-                writer.write(ClusterBase.formatVector(point.getVector().get(), dictionary));
+                writer.write(AbstractCluster.formatVector(point.getVector().get(), dictionary));
                 if (iterator.hasNext()) {
                   writer.write("\n\t");
                 }

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Sat Jul 24 03:39:30 2010
@@ -129,7 +129,7 @@ public class TestClusterDumper extends M
       } else {
         namedVector = new NamedVector(vector, "P(" + i + ')');
       }
-      System.out.println(ClusterBase.formatVector(namedVector, termDictionary));
+      System.out.println(AbstractCluster.formatVector(namedVector, termDictionary));
       sampleData.add(new VectorWritable(namedVector));
       i++;
     }

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Sat Jul 24 03:39:30 2010
@@ -29,8 +29,8 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.clustering.ClusteringTestUtils;
 import org.apache.mahout.clustering.canopy.Canopy;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -77,7 +77,7 @@ public class TestCDbwEvaluator extends M
             IntWritable clusterId = new IntWritable(0);
             VectorWritable point = new VectorWritable();
             while (reader.next(clusterId, point)) {
-              System.out.println("\tC-" + clusterId + ": " + ClusterBase.formatVector(point.get(), null));
+              System.out.println("\tC-" + clusterId + ": " + AbstractCluster.formatVector(point.get(), null));
             }
           } finally {
             reader.close();