You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/28 19:37:13 UTC

svn commit: r939019 [1/2] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/main/java/org/apache/mah...

Author: jeastman
Date: Wed Apr 28 17:37:12 2010
New Revision: 939019

URL: http://svn.apache.org/viewvc?rev=939019&view=rev
Log:
MAHOUT-236:
- replaced NamedVector references with Vector in clustering code
- replaced NamedVector references with Vector in clustering examples
- removed coercions to RandomAccessSparseVector from cluster constructors
- fixed FuzzyKMeans tests to reflect new clustering implementation
- moved CDbwDistantPointWritable to core WeightedPointWritable
- updated CDbw evaluator with above

all tests run

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
Removed:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDistantPointWritable.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java Wed Apr 28 17:37:12 2010
@@ -26,7 +26,6 @@ import java.util.Locale;
 
 import org.apache.hadoop.io.Writable;
 import org.apache.mahout.math.JsonVectorAdapter;
-import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.Vector.Element;
 
@@ -55,7 +54,7 @@ public abstract class ClusterBase implem
   private int id;
 
   // the current cluster center
-  private Vector center = new RandomAccessSparseVector(0);
+  private Vector center;
 
   // the number of points in the cluster
   private int numPoints;

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java?rev=939019&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java Wed Apr 28 17:37:12 2010
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.VectorWritable;
+
+public class WeightedPointWritable implements Writable {
+
+  /**
+   * @return the weight
+   */
+  public double getWeight() {
+    return weight;
+  }
+
+  /**
+   * @return the point
+   */
+  public VectorWritable getPoint() {
+    return point;
+  }
+
+  public WeightedPointWritable(double weight, VectorWritable point) {
+    super();
+    this.weight = weight;
+    this.point = point;
+  }
+
+  public WeightedPointWritable() {
+    super();
+  }
+
+  private double weight;
+
+  private VectorWritable point;
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    weight = in.readDouble();
+    point = new VectorWritable();
+    point.readFields(in);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(weight);
+    point.write(out);
+  }
+
+  public String toString() {
+    return String.valueOf(weight) + ": " + (point == null ? "null" : ClusterBase.formatVector(point.get(), null));
+  }
+
+}

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java Wed Apr 28 17:37:12 2010
@@ -49,8 +49,8 @@ public class Canopy extends ClusterBase 
    */
   public Canopy(Vector point, int canopyId) {
     this.setId(canopyId);
-    this.setCenter(new RandomAccessSparseVector(point.clone()));
-    this.setPointTotal(getCenter().clone());
+    this.setCenter(point.clone());
+    this.setPointTotal(point.clone());
     this.setNumPoints(1);
   }
   
@@ -65,7 +65,7 @@ public class Canopy extends ClusterBase 
     super.readFields(in);
     VectorWritable temp = new VectorWritable();
     temp.readFields(in);
-    this.setCenter(new RandomAccessSparseVector(temp.get()));
+    this.setCenter(temp.get());
     this.setPointTotal(getCenter().clone());
     this.setNumPoints(1);
   }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java Wed Apr 28 17:37:12 2010
@@ -201,10 +201,16 @@ public class FuzzyKMeansClusterer {
    *          the initial List<SoftCluster> of clusters
    * @param measure
    *          the DistanceMeasure to use
+   * @param threshold
+   *          the double convergence threshold
+   * @param m
+   *          the double "fuzzyness" argument (>1)
    * @param numIter
    *          the maximum number of iterations
+   * @return
+   *          a List<List<SoftCluster>> of clusters produced per iteration
    */
-  public static List<List<SoftCluster>> clusterPoints(List<NamedVector> points,
+  public static List<List<SoftCluster>> clusterPoints(List<Vector> points,
                                                       List<SoftCluster> clusters,
                                                       DistanceMeasure measure,
                                                       double threshold,
@@ -237,7 +243,7 @@ public class FuzzyKMeansClusterer {
    *          the List<Cluster> clusters
    * @return
    */
-  public static boolean runFuzzyKMeansIteration(List<NamedVector> points,
+  public static boolean runFuzzyKMeansIteration(List<Vector> points,
                                                 List<SoftCluster> clusterList,
                                                 FuzzyKMeansClusterer clusterer) {
     for (Vector point : points) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java Wed Apr 28 17:37:12 2010
@@ -23,7 +23,6 @@ import java.io.IOException;
 
 import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.math.AbstractVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.function.Functions;
@@ -93,7 +92,7 @@ public class SoftCluster extends Cluster
    *          the center point
    */
   public SoftCluster(Vector center) {
-    setCenter(new RandomAccessSparseVector(center));
+    setCenter(center.clone());
     this.pointProbSum = 0;
     this.weightedPointTotal = getCenter().like();
   }
@@ -112,7 +111,7 @@ public class SoftCluster extends Cluster
     converged = in.readBoolean();
     VectorWritable temp = new VectorWritable();
     temp.readFields(in);
-    this.setCenter(new RandomAccessSparseVector(temp.get()));
+    this.setCenter(temp.get());
     this.pointProbSum = 0;
     this.weightedPointTotal = getCenter().like();
   }
@@ -124,6 +123,8 @@ public class SoftCluster extends Cluster
    */
   @Override
   public Vector computeCentroid() {
+    if (centroid != null)
+      return centroid;
     if (pointProbSum == 0) {
       return weightedPointTotal;
     } else if (centroid == null) {
@@ -157,7 +158,7 @@ public class SoftCluster extends Cluster
   
   @Override
   public String toString() {
-    return getIdentifier() + ": " + getCenter().asFormatString();
+    return asFormatString(null);
   }
   
   @Override

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Wed Apr 28 17:37:12 2010
@@ -104,7 +104,7 @@ public class Cluster extends ClusterBase
     this.converged = in.readBoolean();
     VectorWritable temp = new VectorWritable();
     temp.readFields(in);
-    this.setCenter(new RandomAccessSparseVector(temp.get()));
+    this.setCenter(temp.get());
     this.setNumPoints(0);
     this.setPointTotal(getCenter().like());
     this.pointSquaredTotal = getCenter().like();
@@ -134,7 +134,7 @@ public class Cluster extends ClusterBase
    */
   public Cluster(Vector center) {
     super();
-    this.setCenter(new RandomAccessSparseVector(center));
+    this.setCenter(center.clone());
     this.setNumPoints(0);
     this.setPointTotal(getCenter().like());
     this.pointSquaredTotal = getCenter().like();
@@ -152,7 +152,7 @@ public class Cluster extends ClusterBase
   public Cluster(Vector center, int clusterId) {
     super();
     this.setId(clusterId);
-    this.setCenter(new RandomAccessSparseVector(center));
+    this.setCenter(center.clone());
     this.setNumPoints(0);
     this.setPointTotal(getCenter().like());
     this.pointSquaredTotal = getCenter().like();

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Wed Apr 28 17:37:12 2010
@@ -111,7 +111,7 @@ public class KMeansClusterer {
    * @param maxIter
    *          the maximum number of iterations
    */
-  public static List<List<Cluster>> clusterPoints(List<NamedVector> points,
+  public static List<List<Cluster>> clusterPoints(List<Vector> points,
                                                   List<Cluster> clusters,
                                                   DistanceMeasure measure,
                                                   int maxIter,
@@ -146,7 +146,7 @@ public class KMeansClusterer {
    *          a DistanceMeasure to use
    * @return
    */
-  public static boolean runKMeansIteration(List<NamedVector> points,
+  public static boolean runKMeansIteration(List<Vector> points,
                                            List<Cluster> clusters,
                                            DistanceMeasure measure,
                                            double distanceThreshold) {

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Wed Apr 28 17:37:12 2010
@@ -37,14 +37,13 @@ import org.apache.mahout.common.DummyRep
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
 public class TestFuzzyKmeansClustering extends MahoutTestCase {
-  
+
   private FileSystem fs;
-  
+
   private static void rmr(String path) {
     File f = new File(path);
     if (f.exists()) {
@@ -57,7 +56,7 @@ public class TestFuzzyKmeansClustering e
       f.delete();
     }
   }
-  
+
   @Override
   protected void setUp() throws Exception {
     super.setUp();
@@ -66,86 +65,98 @@ public class TestFuzzyKmeansClustering e
     Configuration conf = new Configuration();
     fs = FileSystem.get(conf);
   }
-  
+
   private static double round(double val, int places) {
     long factor = (long) Math.pow(10, places);
-    
+
     // Shift the decimal the correct number of places
     // to the right.
     val *= factor;
-    
+
     // Round to the nearest integer.
     long tmp = Math.round(val);
-    
+
     // Shift the decimal the correct number of places
     // back to the left.
     return (double) tmp / factor;
   }
-  
+
   private static Vector tweakValue(Vector point) {
     return point.plus(0.1);
-    
+
   }
-  
-  private static void computeCluster(List<NamedVector> points,
-                                     List<SoftCluster> clusterList,
-                                     FuzzyKMeansClusterer clusterer,
-                                     Map<String,String> pointClusterInfo) {
-    
-    for (NamedVector point : points) {
-      StringBuilder outputValue = new StringBuilder("[");
+
+  private static void computeCluster(List<Vector> points, List<SoftCluster> clusterList, FuzzyKMeansClusterer clusterer,
+      Map<Integer, List<Vector>> pointClusterInfo) {
+
+    for (Vector point : points) {
       List<Double> clusterDistanceList = new ArrayList<Double>();
+      SoftCluster closestCluster = null;
+      double closestDistance = Double.MAX_VALUE;
       for (SoftCluster cluster : clusterList) {
-        clusterDistanceList.add(clusterer.getMeasure().distance(point, cluster.getCenter()));
+        double distance = clusterer.getMeasure().distance(point, cluster.getCenter());
+        if (distance < closestDistance) {
+          closestDistance = distance;
+          closestCluster = cluster;
+        }
+        clusterDistanceList.add(distance);
+      }
+      List<Vector> list = pointClusterInfo.get(closestCluster.getId());
+      if (list == null) {
+        list = new ArrayList<Vector>();
+        pointClusterInfo.put(closestCluster.getId(), list);
       }
+      list.add(point);
+      double totalProb = 0;
       for (int i = 0; i < clusterList.size(); i++) {
+        SoftCluster cluster = clusterList.get(i);
         double probWeight = clusterer.computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
-        outputValue.append(clusterList.get(i).getId()).append(':').append(probWeight).append(' ');
+        totalProb += probWeight;
       }
-      pointClusterInfo.put(point.getName(), outputValue.toString().trim() + ']');
+      assertTrue("total probability", Math.abs(1.0 - totalProb) < 0.0001);
+    }
+
+    for (SoftCluster cluster : clusterList) {
+      System.out.println(cluster.asFormatString(null));
+      List<Vector> list = pointClusterInfo.get(cluster.getId());
+      if (list != null)
+        for (Vector vector : list) {
+          System.out.println("\t" + vector);
+        }
     }
   }
-  
+
   public void testReferenceImplementation() throws Exception {
-    List<NamedVector> points = TestKmeansClustering.getPoints(TestKmeansClustering.reference);
+    List<Vector> points = TestKmeansClustering.getPoints(TestKmeansClustering.reference);
     for (int k = 0; k < points.size(); k++) {
       System.out.println("test k= " + k);
-      
+
       List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
       // pick k initial cluster centers at random
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i));
-        SoftCluster cluster = new SoftCluster(vec);
+        SoftCluster cluster = new SoftCluster(vec, i);
         // add the center so the centroid will be correct upon output
-        cluster.addPoint(cluster.getCenter(), 1);
-        
+        //cluster.addPoint(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
-      Map<String,String> pointClusterInfo = new HashMap<String,String>();
+      Map<Integer, List<Vector>> pointClusterInfo = new HashMap<Integer, List<Vector>>();
       // run reference FuzzyKmeans algorithm
-      List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points, clusterList,
-        new EuclideanDistanceMeasure(), 0.001, 2, 2);
-      computeCluster(points, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(
-          new EuclideanDistanceMeasure(), 0.001, 2), pointClusterInfo);
-      
-      // iterate for each point
-      for (String value : pointClusterInfo.values()) {
-        String clusterInfoStr = value.substring(1, value.length() - 1);
-        String[] clusterInfoList = clusterInfoStr.split(" ");
-        assertEquals("Number of clusters", k + 1, clusterInfoList.length);
-        double prob = 0.0;
-        for (String clusterInfo : clusterInfoList) {
-          String[] clusterProb = clusterInfo.split(":");
-          
-          double clusterProbVal = Double.parseDouble(clusterProb[1]);
-          prob += clusterProbVal;
-        }
-        prob = round(prob, 1);
-        assertEquals("Sum of cluster Membership problability should be equal to=", 1.0, prob);
+      List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points, clusterList, new EuclideanDistanceMeasure(),
+          0.001, 2, 2);
+      computeCluster(points, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(new EuclideanDistanceMeasure(), 0.001, 2),
+          pointClusterInfo);
+
+      // iterate for each cluster
+      int size = 0;
+      for (int cId : pointClusterInfo.keySet()) {
+        List<Vector> pts = pointClusterInfo.get(cId);
+        size += pts.size();
       }
+      assertEquals("total size", size, points.size());
     }
   }
-  
+
   public void testFuzzyKMeansMRJob() throws Exception {
     List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
     File testData = new File("testdata");
@@ -158,7 +169,7 @@ public class TestFuzzyKmeansClustering e
     }
     Configuration conf = new Configuration();
     ClusteringTestUtils.writePointsToFile(points, "testdata/points/file1", fs, conf);
-    
+
     for (int k = 0; k < points.size(); k++) {
       System.out.println("testKFuzzyKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
@@ -168,21 +179,21 @@ public class TestFuzzyKmeansClustering e
       if (fs.exists(path)) {
         fs.delete(path, true);
       }
-      
+
       testData = new File("testdata/clusters");
       if (!testData.exists()) {
         testData.mkdir();
       }
-      
+
       /*
        * BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( new
        * FileOutputStream("testdata/clusters/part-00000"), Charset .forName("UTF-8")));
        */
-      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
-          new Path("testdata/clusters/part-00000"), Text.class, SoftCluster.class);
+      SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path("testdata/clusters/part-00000"), Text.class,
+          SoftCluster.class);
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
-        
+
         SoftCluster cluster = new SoftCluster(vec);
         // add the center so the centroid will be correct upon output
         cluster.addPoint(cluster.getCenter(), 1);
@@ -190,10 +201,10 @@ public class TestFuzzyKmeansClustering e
          * writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
          */
         writer.append(new Text(cluster.getIdentifier()), cluster);
-        
+
       }
       writer.close();
-      
+
       Path outPath = new Path("output");
       fs = FileSystem.get(outPath.toUri(), conf);
       if (fs.exists(outPath)) {
@@ -201,9 +212,9 @@ public class TestFuzzyKmeansClustering e
       }
       fs.mkdirs(outPath);
       // now run the Job
-      FuzzyKMeansDriver.runJob("testdata/points", "testdata/clusters", "output",
-        EuclideanDistanceMeasure.class.getName(), 0.001, 2, 1, k + 1, 2);
-      
+      FuzzyKMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class.getName(), 0.001,
+          2, 1, k + 1, 2);
+
       // now compare the expected clusters with actual
       File outDir = new File("output/clusteredPoints");
       assertTrue("output dir exists?", outDir.exists());
@@ -214,199 +225,196 @@ public class TestFuzzyKmeansClustering e
       VectorWritable out = new VectorWritable();
       while (reader.next(key, out)) {
         // make sure we can read all the clusters
-      }     
+      }
       reader.close();
-      
+
     }
-    
+
   }
-  
+
   public void testFuzzyKMeansMapper() throws Exception {
     List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-    
+
     for (int k = 0; k < points.size(); k++) {
       System.out.println("testKFuzzyKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
       List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-      
+
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
-        
+
         SoftCluster cluster = new SoftCluster(vec, i);
         cluster.addPoint(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
-      
+
       // run mapper
       FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
       mapper.config(clusterList);
-      
+
       JobConf conf = new JobConf();
-      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
-        "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
       conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
       conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
       mapper.configure(conf);
-      
-      DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+      DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapCollector, null);
       }
-      
+
       // now verify mapper output
       assertEquals("Mapper Keys", k + 1, mapCollector.getData().size());
-      
-      Map<Vector,Double> pointTotalProbMap = new HashMap<Vector,Double>();
-      
+
+      Map<Vector, Double> pointTotalProbMap = new HashMap<Vector, Double>();
+
       for (Text key : mapCollector.getKeys()) {
         // SoftCluster cluster = SoftCluster.decodeCluster(key);
         List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
-        
+
         for (FuzzyKMeansInfo value : values) {
-          
+
           Double val = pointTotalProbMap.get(value.getVector());
           double probVal = 0.0;
           if (val != null) {
             probVal = val;
           }
-          
+
           pointTotalProbMap.put(value.getVector(), probVal + value.getProbability());
         }
       }
-      
-      for (Map.Entry<Vector,Double> entry : pointTotalProbMap.entrySet()) {
+
+      for (Map.Entry<Vector, Double> entry : pointTotalProbMap.entrySet()) {
         Vector key = entry.getKey();
         double value = round(entry.getValue(), 1);
-        
+
         assertEquals("total Prob for Point:" + key, 1.0, value);
       }
     }
   }
-  
+
   public void testFuzzyKMeansCombiner() throws Exception {
     List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-    
+
     for (int k = 0; k < points.size(); k++) {
       System.out.println("testKFuzzyKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
       List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-      
+
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
-        
+
         SoftCluster cluster = new SoftCluster(vec, i);
         cluster.addPoint(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
-      
+
       // run mapper
       FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
       mapper.config(clusterList);
-      
+
       JobConf conf = new JobConf();
-      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
-        "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
       conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
       conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
       mapper.configure(conf);
-      
-      DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+      DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapCollector, null);
       }
-      
+
       // run combiner
-      DummyOutputCollector<Text,FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+      DummyOutputCollector<Text, FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
       combiner.configure(conf);
-      
+
       for (Text key : mapCollector.getKeys()) {
-        
+
         List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
         combiner.reduce(new Text(key), values.iterator(), combinerCollector, null);
       }
-      
+
       // now verify the combiner output
       assertEquals("Combiner Output", k + 1, combinerCollector.getData().size());
-      
+
       for (Text key : combinerCollector.getKeys()) {
         List<FuzzyKMeansInfo> values = combinerCollector.getValue(key);
         assertEquals("too many values", 1, values.size());
       }
     }
   }
-  
+
   public void testFuzzyKMeansReducer() throws Exception {
     List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-    
+
     for (int k = 0; k < points.size(); k++) {
       System.out.println("testKFuzzyKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
       List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-      
+
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
-        
+
         SoftCluster cluster = new SoftCluster(vec, i);
         // cluster.addPoint(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
-      
+
       // run mapper
       FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
       mapper.config(clusterList);
-      
+
       JobConf conf = new JobConf();
-      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
-        "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
       conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
       conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
       mapper.configure(conf);
-      
-      DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+      DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapCollector, null);
       }
-      
+
       // run combiner
-      DummyOutputCollector<Text,FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+      DummyOutputCollector<Text, FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
       combiner.configure(conf);
-      
+
       for (Text key : mapCollector.getKeys()) {
         List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
         combiner.reduce(new Text(key), values.iterator(), combinerCollector, null);
       }
-      
+
       // run reducer
-      DummyOutputCollector<Text,SoftCluster> reducerCollector = new DummyOutputCollector<Text,SoftCluster>();
+      DummyOutputCollector<Text, SoftCluster> reducerCollector = new DummyOutputCollector<Text, SoftCluster>();
       FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
       reducer.config(clusterList);
       reducer.configure(conf);
-      
+
       for (Text key : combinerCollector.getKeys()) {
         List<FuzzyKMeansInfo> values = combinerCollector.getValue(key);
         reducer.reduce(new Text(key), values.iterator(), reducerCollector, new DummyReporter());
       }
-      
+
       // now verify the reducer output
       assertEquals("Reducer Output", k + 1, combinerCollector.getData().size());
-      
+
       // compute the reference result after one iteration and compare
       List<SoftCluster> reference = new ArrayList<SoftCluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
         reference.add(new SoftCluster(vec, i));
       }
-      List<NamedVector> pointsVectors = new ArrayList<NamedVector>();
+      List<Vector> pointsVectors = new ArrayList<Vector>();
       for (VectorWritable point : points) {
-        pointsVectors.add((NamedVector) point.get());
+        pointsVectors.add((Vector) point.get());
       }
-      
+
       DistanceMeasure measure = new EuclideanDistanceMeasure();
       FuzzyKMeansClusterer clusterer = new FuzzyKMeansClusterer(measure, 0.001, 2);
       FuzzyKMeansClusterer.runFuzzyKMeansIteration(pointsVectors, reference, clusterer);
-      
+
       for (SoftCluster key : reference) {
         String clusterId = key.getIdentifier();
         List<SoftCluster> values = reducerCollector.getValue(new Text(clusterId));
@@ -414,39 +422,38 @@ public class TestFuzzyKmeansClustering e
         System.out.println("ref= " + key.toString() + " cluster= " + cluster.toString());
         cluster.recomputeCenter();
         assertEquals("key center: " + key.getCenter().asFormatString() + " does not equal cluster: "
-                     + cluster.getCenter().asFormatString(), key.getCenter(), cluster.getCenter());
+            + cluster.getCenter().asFormatString(), key.getCenter(), cluster.getCenter());
       }
     }
   }
-  
+
   public void testFuzzyKMeansClusterMapper() throws Exception {
     List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-    
+
     for (int k = 0; k < points.size(); k++) {
       System.out.println("testKFuzzyKMeansMRJob k= " + k);
       // pick k initial cluster centers at random
       List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-      
+
       for (int i = 0; i < k + 1; i++) {
         Vector vec = tweakValue(points.get(i).get());
-        
+
         SoftCluster cluster = new SoftCluster(vec, i);
         cluster.addPoint(cluster.getCenter(), 1);
         clusterList.add(cluster);
       }
-      
+
       // run mapper
       FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
       mapper.config(clusterList);
-      
+
       JobConf conf = new JobConf();
-      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
-        "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+      conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
       conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
       conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
       mapper.configure(conf);
-      
-      DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+      DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, mapCollector, null);
       }
@@ -454,30 +461,30 @@ public class TestFuzzyKmeansClustering e
         softCluster.recomputeCenter();
       }
       // run combiner
-      DummyOutputCollector<Text,FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+      DummyOutputCollector<Text, FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
       FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
       combiner.configure(conf);
-      
+
       for (Text key : mapCollector.getKeys()) {
-        
+
         List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
         combiner.reduce(new Text(key), values.iterator(), combinerCollector, null);
       }
-      
+
       // run reducer
-      DummyOutputCollector<Text,SoftCluster> reducerCollector = new DummyOutputCollector<Text,SoftCluster>();
+      DummyOutputCollector<Text, SoftCluster> reducerCollector = new DummyOutputCollector<Text, SoftCluster>();
       FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
       reducer.config(clusterList);
       reducer.configure(conf);
-      
+
       for (Text key : combinerCollector.getKeys()) {
         List<FuzzyKMeansInfo> values = combinerCollector.getValue(key);
         reducer.reduce(new Text(key), values.iterator(), reducerCollector, null);
       }
-      
+
       // run clusterMapper
       List<SoftCluster> reducerCluster = new ArrayList<SoftCluster>();
-      
+
       for (Text key : reducerCollector.getKeys()) {
         List<SoftCluster> values = reducerCollector.getValue(key);
         reducerCluster.add(values.get(0));
@@ -485,17 +492,17 @@ public class TestFuzzyKmeansClustering e
       for (SoftCluster softCluster : reducerCluster) {
         softCluster.recomputeCenter();
       }
-      
-      DummyOutputCollector<IntWritable,VectorWritable> clusterMapperCollector = new DummyOutputCollector<IntWritable,VectorWritable>();
-      
+
+      DummyOutputCollector<IntWritable, VectorWritable> clusterMapperCollector = new DummyOutputCollector<IntWritable, VectorWritable>();
+
       FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
       clusterMapper.config(reducerCluster);
       clusterMapper.configure(conf);
-      
+
       for (VectorWritable point : points) {
         clusterMapper.map(new Text(), point, clusterMapperCollector, null);
       }
-      
+
       // now run for one iteration of referencefuzzykmeans and compare the
       // results
       // compute the reference result after one iteration and compare
@@ -504,36 +511,32 @@ public class TestFuzzyKmeansClustering e
         Vector vec = tweakValue(points.get(i).get());
         reference.add(new SoftCluster(vec, i));
       }
-      Map<String,String> pointClusterInfo = new HashMap<String,String>();
-      List<NamedVector> pointsVectors = new ArrayList<NamedVector>();
+      Map<Integer, List<Vector>> refClusters = new HashMap<Integer, List<Vector>>();
+      List<Vector> pointsVectors = new ArrayList<Vector>();
       for (VectorWritable point : points) {
-        pointsVectors.add((NamedVector) point.get());
+        pointsVectors.add((Vector) point.get());
       }
-      
+
       List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(pointsVectors, reference,
-        new EuclideanDistanceMeasure(), 0.001, 2, 1);
-      computeCluster(pointsVectors, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(
-          new EuclideanDistanceMeasure(), 0.001, 2), pointClusterInfo);
-      
-      // Now compare the clustermapper results with reducer
-      for (IntWritable key : clusterMapperCollector.getKeys()) {
-        List<VectorWritable> value = clusterMapperCollector.getValue(key);
-        
-        String refValue = pointClusterInfo.get(key.toString());
-        String clusterInfoStr = refValue.substring(1, refValue.length() - 1);
-        String[] refClusterInfoList = clusterInfoStr.split(" ");
-        assertEquals("Number of clusters", k + 1, refClusterInfoList.length);
-        Map<String,Double> refClusterInfoMap = new HashMap<String,Double>();
-        for (String clusterInfo : refClusterInfoList) {
-          String[] clusterProb = clusterInfo.split(":");
-          double clusterProbVal = Double.parseDouble(clusterProb[1]);
-          refClusterInfoMap.put(clusterProb[0], clusterProbVal);
-        }
-        
-        VectorWritable kMeansOutput = value.get(0);
-        // TODO: fail("test this output");
+          new EuclideanDistanceMeasure(), 0.001, 2, 1);
+
+      computeCluster(pointsVectors, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(new EuclideanDistanceMeasure(),
+          0.001, 2), refClusters);
+
+      // Now compare the clustermapper results with reference implementation
+      assertEquals("mapper and reference sizes", refClusters.size(), clusterMapperCollector.getKeys().size());
+      for (int pcId : refClusters.keySet()) {
+        assertEquals("cluster " + pcId + " sizes", refClusters.get(pcId).size(), clusterMapperCollector.getValue(
+            new IntWritable(pcId)).size());
+      }
+      // make sure all points are allocated to a cluster
+      int size = 0;
+      for (int cId : refClusters.keySet()) {
+        List<Vector> pts = refClusters.get(cId);
+        size += pts.size();
       }
+      assertEquals("total size", size, points.size());
     }
   }
-  
+
 }

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Wed Apr 28 17:37:12 2010
@@ -40,23 +40,21 @@ import org.apache.mahout.common.distance
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
 import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.SequentialAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
 public class TestKmeansClustering extends MahoutTestCase {
-  
-  public static final double[][] reference = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4},
-                                              {4, 5}, {5, 5}};
-  
-  private static final int[][] expectedNumPoints = { {9}, {4, 5}, {4, 4, 1}, {1, 2, 1, 5}, {1, 1, 1, 2, 4},
-                                                    {1, 1, 1, 1, 1, 4}, {1, 1, 1, 1, 1, 2, 2},
-                                                    {1, 1, 1, 1, 1, 1, 2, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}};
-  
+
+  public static final double[][] reference = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
+      { 5, 5 } };
+
+  private static final int[][] expectedNumPoints = { { 9 }, { 4, 5 }, { 4, 4, 1 }, { 1, 2, 1, 5 }, { 1, 1, 1, 2, 4 },
+      { 1, 1, 1, 1, 1, 4 }, { 1, 1, 1, 1, 1, 2, 2 }, { 1, 1, 1, 1, 1, 1, 2, 1 }, { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };
+
   private FileSystem fs;
-  
+
   private static void rmr(String path) {
     File f = new File(path);
     if (f.exists()) {
@@ -69,7 +67,7 @@ public class TestKmeansClustering extend
       f.delete();
     }
   }
-  
+
   @Override
   protected void setUp() throws Exception {
     super.setUp();
@@ -78,32 +76,32 @@ public class TestKmeansClustering extend
     Configuration conf = new Configuration();
     fs = FileSystem.get(conf);
   }
-  
+
   public static List<VectorWritable> getPointsWritable(double[][] raw) {
     List<VectorWritable> points = new ArrayList<VectorWritable>();
     for (int i = 0; i < raw.length; i++) {
       double[] fr = raw[i];
       Vector vec = new RandomAccessSparseVector(fr.length);
       vec.assign(fr);
-      points.add(new VectorWritable(new NamedVector(vec, String.valueOf(i))));
+      points.add(new VectorWritable(vec));
     }
     return points;
   }
-  
-  public static List<NamedVector> getPoints(double[][] raw) {
-    List<NamedVector> points = new ArrayList<NamedVector>();
+
+  public static List<Vector> getPoints(double[][] raw) {
+    List<Vector> points = new ArrayList<Vector>();
     for (int i = 0; i < raw.length; i++) {
       double[] fr = raw[i];
-      NamedVector vec = new NamedVector(new SequentialAccessSparseVector(fr.length), String.valueOf(i));
+      Vector vec = new SequentialAccessSparseVector(fr.length);
       vec.assign(fr);
       points.add(vec);
     }
     return points;
   }
-  
+
   /** Story: Test the reference implementation */
   public void testReferenceImplementation() throws Exception {
-    List<NamedVector> points = getPoints(reference);
+    List<Vector> points = getPoints(reference);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // try all possible values of k
     for (int k = 0; k < points.size(); k++) {
@@ -116,8 +114,7 @@ public class TestKmeansClustering extend
       }
       // iterate clusters until they converge
       int maxIter = 10;
-      List<List<Cluster>> clustersList = KMeansClusterer.clusterPoints(points, clusters, measure, maxIter,
-        0.001);
+      List<List<Cluster>> clustersList = KMeansClusterer.clusterPoints(points, clusters, measure, maxIter, 0.001);
       clusters = clustersList.get(clustersList.size() - 1);
       for (int c = 0; c < clusters.size(); c++) {
         Cluster cluster = clusters.get(c);
@@ -126,9 +123,9 @@ public class TestKmeansClustering extend
       }
     }
   }
-  
+
   public void testStd() {
-    List<NamedVector> points = getPoints(reference);
+    List<Vector> points = getPoints(reference);
     Cluster c = new Cluster(points.get(0));
     for (Vector p : points) {
       c.addPoint(p);
@@ -138,30 +135,29 @@ public class TestKmeansClustering extend
     }
   }
 
-  private static Map<String,Cluster> loadClusterMap(List<Cluster> clusters) {
-    Map<String,Cluster> clusterMap = new HashMap<String,Cluster>();
-    
+  private static Map<String, Cluster> loadClusterMap(List<Cluster> clusters) {
+    Map<String, Cluster> clusterMap = new HashMap<String, Cluster>();
+
     for (Cluster cluster : clusters) {
       clusterMap.put(cluster.getIdentifier(), cluster);
     }
     return clusterMap;
   }
-  
+
   /** Story: test that the mapper will map input points to the nearest cluster */
   public void testKMeansMapper() throws Exception {
     KMeansMapper mapper = new KMeansMapper();
     JobConf conf = new JobConf();
-    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
-      "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
     conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
     conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, "");
     mapper.configure(conf);
     List<VectorWritable> points = getPointsWritable(reference);
     for (int k = 0; k < points.size(); k++) {
       // pick k initial cluster centers at random
-      DummyOutputCollector<Text,KMeansInfo> collector = new DummyOutputCollector<Text,KMeansInfo>();
+      DummyOutputCollector<Text, KMeansInfo> collector = new DummyOutputCollector<Text, KMeansInfo>();
       List<Cluster> clusters = new ArrayList<Cluster>();
-      
+
       for (int i = 0; i < k + 1; i++) {
         Cluster cluster = new Cluster(points.get(i).get(), i);
         // add the center so the centroid will be correct upon output
@@ -169,7 +165,7 @@ public class TestKmeansClustering extend
         clusters.add(cluster);
       }
       mapper.config(clusters);
-      
+
       // map the data
       for (VectorWritable point : points) {
         mapper.map(new Text(), point, collector, null);
@@ -177,21 +173,20 @@ public class TestKmeansClustering extend
       assertEquals("Number of map results", k + 1, collector.getData().size());
       // now verify that all points are correctly allocated
       EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
-      Map<String,Cluster> clusterMap = loadClusterMap(clusters);
+      Map<String, Cluster> clusterMap = loadClusterMap(clusters);
       for (Text key : collector.getKeys()) {
         Cluster cluster = clusterMap.get(key.toString());
         List<KMeansInfo> values = collector.getValue(key);
         for (KMeansInfo value : values) {
           double distance = euclideanDistanceMeasure.distance(cluster.getCenter(), value.getPointTotal());
           for (Cluster c : clusters) {
-            assertTrue("distance error", distance <= euclideanDistanceMeasure.distance(value.getPointTotal(),
-              c.getCenter()));
+            assertTrue("distance error", distance <= euclideanDistanceMeasure.distance(value.getPointTotal(), c.getCenter()));
           }
         }
       }
     }
   }
-  
+
   /**
    * Story: test that the combiner will produce partial cluster totals for all of the clusters and points that
    * it sees
@@ -199,19 +194,18 @@ public class TestKmeansClustering extend
   public void testKMeansCombiner() throws Exception {
     KMeansMapper mapper = new KMeansMapper();
     JobConf conf = new JobConf();
-    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
-      "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
     conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
     conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, "");
     mapper.configure(conf);
     List<VectorWritable> points = getPointsWritable(reference);
     for (int k = 0; k < points.size(); k++) {
       // pick k initial cluster centers at random
-      DummyOutputCollector<Text,KMeansInfo> collector = new DummyOutputCollector<Text,KMeansInfo>();
+      DummyOutputCollector<Text, KMeansInfo> collector = new DummyOutputCollector<Text, KMeansInfo>();
       List<Cluster> clusters = new ArrayList<Cluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
-        
+
         Cluster cluster = new Cluster(vec, i);
         // add the center so the centroid will be correct upon output
         cluster.addPoint(cluster.getCenter());
@@ -224,11 +218,11 @@ public class TestKmeansClustering extend
       }
       // now combine the data
       KMeansCombiner combiner = new KMeansCombiner();
-      DummyOutputCollector<Text,KMeansInfo> collector2 = new DummyOutputCollector<Text,KMeansInfo>();
+      DummyOutputCollector<Text, KMeansInfo> collector2 = new DummyOutputCollector<Text, KMeansInfo>();
       for (Text key : collector.getKeys()) {
         combiner.reduce(new Text(key), collector.getValue(key).iterator(), collector2, null);
       }
-      
+
       assertEquals("Number of map results", k + 1, collector2.getData().size());
       // now verify that all points are accounted for
       int count = 0;
@@ -238,7 +232,7 @@ public class TestKmeansClustering extend
         assertEquals("too many values", 1, values.size());
         // String value = values.get(0).toString();
         KMeansInfo info = values.get(0);
-        
+
         count += info.getPoints();
         total = total.plus(info.getPointTotal());
       }
@@ -247,7 +241,7 @@ public class TestKmeansClustering extend
       assertEquals("point total[1]", 27, (int) total.get(1));
     }
   }
-  
+
   /**
    * Story: test that the reducer will sum the partial cluster totals for all of the clusters and points that
    * it sees
@@ -256,8 +250,7 @@ public class TestKmeansClustering extend
     KMeansMapper mapper = new KMeansMapper();
     EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
     JobConf conf = new JobConf();
-    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
-      "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
     conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
     conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, "");
     mapper.configure(conf);
@@ -265,7 +258,7 @@ public class TestKmeansClustering extend
     for (int k = 0; k < points.size(); k++) {
       System.out.println("K = " + k);
       // pick k initial cluster centers at random
-      DummyOutputCollector<Text,KMeansInfo> collector = new DummyOutputCollector<Text,KMeansInfo>();
+      DummyOutputCollector<Text, KMeansInfo> collector = new DummyOutputCollector<Text, KMeansInfo>();
       List<Cluster> clusters = new ArrayList<Cluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
@@ -281,40 +274,39 @@ public class TestKmeansClustering extend
       }
       // now combine the data
       KMeansCombiner combiner = new KMeansCombiner();
-      DummyOutputCollector<Text,KMeansInfo> collector2 = new DummyOutputCollector<Text,KMeansInfo>();
+      DummyOutputCollector<Text, KMeansInfo> collector2 = new DummyOutputCollector<Text, KMeansInfo>();
       for (Text key : collector.getKeys()) {
         combiner.reduce(new Text(key), collector.getValue(key).iterator(), collector2, null);
       }
-      
+
       // now reduce the data
       KMeansReducer reducer = new KMeansReducer();
       reducer.configure(conf);
       reducer.config(clusters);
-      DummyOutputCollector<Text,Cluster> collector3 = new DummyOutputCollector<Text,Cluster>();
+      DummyOutputCollector<Text, Cluster> collector3 = new DummyOutputCollector<Text, Cluster>();
       for (Text key : collector2.getKeys()) {
         reducer.reduce(new Text(key), collector2.getValue(key).iterator(), collector3, new DummyReporter());
       }
-      
+
       assertEquals("Number of map results", k + 1, collector3.getData().size());
-      
+
       // compute the reference result after one iteration and compare
       List<Cluster> reference = new ArrayList<Cluster>();
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
         reference.add(new Cluster(vec, i));
       }
-      List<NamedVector> pointsVectors = new ArrayList<NamedVector>();
+      List<Vector> pointsVectors = new ArrayList<Vector>();
       for (VectorWritable point : points) {
-        pointsVectors.add((NamedVector) point.get());
+        pointsVectors.add((Vector) point.get());
       }
-      boolean converged = KMeansClusterer.runKMeansIteration(pointsVectors, reference,
-        euclideanDistanceMeasure, 0.001);
+      boolean converged = KMeansClusterer.runKMeansIteration(pointsVectors, reference, euclideanDistanceMeasure, 0.001);
       if (k == 8) {
         assertTrue("not converged? " + k, converged);
       } else {
         assertFalse("converged? " + k, converged);
       }
-      
+
       // now verify that all clusters have correct centers
       converged = true;
       for (int i = 0; i < reference.size(); i++) {
@@ -335,7 +327,7 @@ public class TestKmeansClustering extend
       }
     }
   }
-  
+
   /** Story: User wishes to run kmeans job on reference data */
   public void testKMeansMRJob() throws Exception {
     List<VectorWritable> points = getPointsWritable(reference);
@@ -347,7 +339,7 @@ public class TestKmeansClustering extend
     if (!testData.exists()) {
       testData.mkdir();
     }
-    
+
     Configuration conf = new Configuration();
     ClusteringTestUtils.writePointsToFile(points, "testdata/points/file1", fs, conf);
     ClusteringTestUtils.writePointsToFile(points, "testdata/points/file2", fs, conf);
@@ -358,10 +350,10 @@ public class TestKmeansClustering extend
       Path path = new Path("testdata/clusters/part-00000");
       FileSystem fs = FileSystem.get(path.toUri(), job);
       SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, Text.class, Cluster.class);
-      
+
       for (int i = 0; i < k + 1; i++) {
         Vector vec = points.get(i).get();
-        
+
         Cluster cluster = new Cluster(vec, i);
         // add the center so the centroid will be correct upon output
         cluster.addPoint(cluster.getCenter());
@@ -370,24 +362,24 @@ public class TestKmeansClustering extend
       writer.close();
       // now run the Job
       HadoopUtil.overwriteOutput("output");
-      KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class
-          .getName(), 0.001, 10, k + 1);
+      KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10,
+          k + 1);
       // now compare the expected clusters with actual
       File outDir = new File("output/clusteredPoints");
       assertTrue("output dir exists?", outDir.exists());
       // assertEquals("output dir files?", 4, outFiles.length);
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
       int[] expect = expectedNumPoints[k];
-      DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
+      DummyOutputCollector<IntWritable, VectorWritable> collector = new DummyOutputCollector<IntWritable, VectorWritable>();
       // The key is the clusterId
       IntWritable clusterId = new IntWritable(0);
       // The value is the vector
       VectorWritable value = new VectorWritable();
       while (reader.next(clusterId, value)) {
-       collector.collect(clusterId, value);
+        collector.collect(clusterId, value);
         clusterId = new IntWritable(0);
         value = new VectorWritable();
-        
+
       }
       reader.close();
       if (k == 2)
@@ -399,7 +391,7 @@ public class TestKmeansClustering extend
       }
     }
   }
-  
+
   /** Story: User wants to use canopy clustering to input the initial clusters for kmeans job. */
   public void testKMeansWithCanopyClusterInput() throws Exception {
     List<VectorWritable> points = getPointsWritable(reference);
@@ -414,23 +406,21 @@ public class TestKmeansClustering extend
     Configuration conf = new Configuration();
     ClusteringTestUtils.writePointsToFile(points, "testdata/points/file1", fs, conf);
     ClusteringTestUtils.writePointsToFile(points, "testdata/points/file2", fs, conf);
-    
+
     // now run the Canopy job
-    CanopyDriver.runJob("testdata/points", "testdata/canopies", ManhattanDistanceMeasure.class.getName(),
-      3.1, 2.1);
-    
+    CanopyDriver.runJob("testdata/points", "testdata/canopies", ManhattanDistanceMeasure.class.getName(), 3.1, 2.1);
+
     // now run the KMeans job
-    KMeansDriver.runJob("testdata/points", "testdata/canopies", "output", EuclideanDistanceMeasure.class
-        .getName(), 0.001, 10, 1);
-    
+    KMeansDriver.runJob("testdata/points", "testdata/canopies", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+
     // now compare the expected clusters with actual
     File outDir = new File("output/clusteredPoints");
     assertTrue("output dir exists?", outDir.exists());
     String[] outFiles = outDir.list();
     assertEquals("output dir files?", 4, outFiles.length);
-    DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
+    DummyOutputCollector<IntWritable, VectorWritable> collector = new DummyOutputCollector<IntWritable, VectorWritable>();
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
-    
+
     // The key is the clusterId
     IntWritable clusterId = new IntWritable(0);
     // The value is the vector
@@ -439,10 +429,10 @@ public class TestKmeansClustering extend
       collector.collect(clusterId, value);
       clusterId = new IntWritable(0);
       value = new VectorWritable();
-      
+
     }
     reader.close();
-    
+
     assertEquals("num points[0]", 4, collector.getValue(new IntWritable(0)).size());
     assertEquals("num points[1]", 5, collector.getValue(new IntWritable(1)).size());
   }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java Wed Apr 28 17:37:12 2010
@@ -215,10 +215,8 @@ public class DisplayDirichlet extends Fr
     sampleParams.add(new DenseVector(params));
     log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] {num, mx, my, sd});
     for (int i = 0; i < num; i++) {
-      sampleData.add(new VectorWritable(
-          new NamedVector(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-                                                        UncommonDistributions.rNorm(my, sd)}),
-                          String.valueOf(i))));
+      sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
+          UncommonDistributions.rNorm(my, sd)})));
     }
   }
   

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java Wed Apr 28 17:37:12 2010
@@ -62,9 +62,9 @@ class DisplayFuzzyKMeans extends Display
   public static void main(String[] args) {
     RandomUtils.useTestSeed();
     DisplayDirichlet.generateSamples();
-    List<NamedVector> points = new ArrayList<NamedVector>();
+    List<Vector> points = new ArrayList<Vector>();
     for (VectorWritable sample : sampleData) {
-      points.add((NamedVector) sample.get());
+      points.add((Vector) sample.get());
     }
     DistanceMeasure measure = new ManhattanDistanceMeasure();
     List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java Wed Apr 28 17:37:12 2010
@@ -33,14 +33,14 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
 class DisplayKMeans extends DisplayDirichlet {
-  
+
   private static List<List<Cluster>> clusters;
-  
+
   DisplayKMeans() {
     initialize();
     this.setTitle("K-Means Clusters (> 5% of population)");
   }
-  
+
   @Override
   public void paint(Graphics g) {
     super.plotSampleData(g);
@@ -59,13 +59,13 @@ class DisplayKMeans extends DisplayDiric
       }
     }
   }
-  
+
   public static void main(String[] args) {
     RandomUtils.useTestSeed();
     DisplayDirichlet.generateSamples();
-    List<NamedVector> points = new ArrayList<NamedVector>();
+    List<Vector> points = new ArrayList<Vector>();
     for (VectorWritable sample : sampleData) {
-      points.add((NamedVector) sample.get());
+      points.add(sample.get());
     }
     DistanceMeasure measure = new ManhattanDistanceMeasure();
     List<Cluster> initialClusters = new ArrayList<Cluster>();

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java Wed Apr 28 17:37:12 2010
@@ -34,7 +34,6 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
@@ -44,8 +43,8 @@ import org.apache.hadoop.mapred.Sequence
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedPointWritable;
 import org.apache.mahout.clustering.dirichlet.DirichletCluster;
-import org.apache.mahout.clustering.dirichlet.DirichletMapper;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -142,6 +141,13 @@ public class CDbwDriver {
       // now point the input to the old output directory
       stateIn = stateOut;
     }
+
+    Configurable client = new JobClient();
+    JobConf conf = new JobConf(CDbwDriver.class);
+    conf.set(STATE_IN_KEY, stateIn);
+    conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
+    CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
+    System.out.println("CDbw = " + evaluator.CDbw());
   }
 
   private static void writeInitialState(String output, String clustersIn) throws ClassNotFoundException, InstantiationException,
@@ -192,7 +198,7 @@ public class CDbwDriver {
     conf.setOutputKeyClass(IntWritable.class);
     conf.setOutputValueClass(VectorWritable.class);
     conf.setMapOutputKeyClass(IntWritable.class);
-    conf.setMapOutputValueClass(CDbwDistantPointWritable.class);
+    conf.setMapOutputValueClass(WeightedPointWritable.class);
 
     FileInputFormat.setInputPaths(conf, new Path(input));
     Path outPath = new Path(stateOut);
@@ -213,36 +219,4 @@ public class CDbwDriver {
       log.warn(e.toString(), e);
     }
   }
-
-  /**
-   * Run the job using supplied arguments
-   * 
-   * @param input
-   *          the directory pathname for input points
-   * @param stateIn
-   *          the directory pathname for input state
-   * @param output
-   *          the directory pathname for output points
-   */
-  public static void runClustering(String input, String stateIn, String output) {
-    Configurable client = new JobClient();
-    JobConf conf = new JobConf(CDbwDriver.class);
-
-    conf.setOutputKeyClass(Text.class);
-    conf.setOutputValueClass(Text.class);
-
-    FileInputFormat.setInputPaths(conf, new Path(input));
-    Path outPath = new Path(output);
-    FileOutputFormat.setOutputPath(conf, outPath);
-
-    conf.setMapperClass(DirichletMapper.class);
-    conf.setNumReduceTasks(0);
-
-    client.setConf(conf);
-    try {
-      JobClient.runJob(conf);
-    } catch (IOException e) {
-      log.warn(e.toString(), e);
-    }
-  }
 }

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=939019&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Wed Apr 28 17:37:12 2010
@@ -0,0 +1,304 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.cdbw;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+public class CDbwEvaluator {
+
+  Map<Integer, List<VectorWritable>> representativePoints;
+
+  Map<Integer, Double> stDevs = new HashMap<Integer, Double>();
+
+  Map<Integer, Cluster> clusters;
+
+  DistanceMeasure measure;
+
+  /**
+   * For testing only
+   * 
+   * @param representativePoints
+   *            a Map<Integer,List<VectorWritable>> of representative points keyed by clusterId
+   * @param clusters
+   *            a Map<Integer,Cluster> of the clusters keyed by clusterId
+   * @param measure
+   *            an appropriate DistanceMeasure
+   */
+  public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints, Map<Integer, Cluster> clusters,
+      DistanceMeasure measure) {
+    super();
+    this.representativePoints = representativePoints;
+    this.clusters = clusters;
+    this.measure = measure;
+    for (Integer cId : representativePoints.keySet()) {
+      setStDev(cId);
+    }
+  }
+
+  /**
+   * Initialize a new instance from job information
+   * 
+   * @param job
+   *            a JobConf with appropriate parameters
+   * @param clustersIn
+   *            a String path to the input clusters directory
+   *            
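+   * @throws SecurityException propagated from loading the representative points or the clusters
+   * @throws IllegalArgumentException propagated from loading the representative points
+   * @throws NoSuchMethodException propagated from loading the representative points or the clusters
+   * @throws InvocationTargetException propagated from loading the representative points or the clusters
+   * @throws ClassNotFoundException if the DistanceMeasure class named in the job cannot be loaded
+   * @throws InstantiationException if the DistanceMeasure or a sequence file key/value cannot be instantiated
+   * @throws IllegalAccessException if the DistanceMeasure or a sequence file key/value cannot be accessed
+   * @throws IOException if the cluster sequence files cannot be read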
+   */
+  public CDbwEvaluator(JobConf job, String clustersIn) throws SecurityException, IllegalArgumentException, NoSuchMethodException,
+      InvocationTargetException, ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
+    super();
+    ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+    Class<?> cl = ccl.loadClass(job.get(CDbwDriver.DISTANCE_MEASURE_KEY));
+    measure = (DistanceMeasure) cl.newInstance();
+    representativePoints = CDbwMapper.getRepresentativePoints(job);
+    clusters = loadClusters(job, clustersIn);
+    for (Integer cId : representativePoints.keySet()) {
+      setStDev(cId);
+    }
+  }
+
+  public double CDbw() {
+    double cdbw = intraClusterDensity() * separation();
+    System.out.println("CDbw=" + cdbw);
+    return cdbw;
+  }
+
+  /**
+   * Load the clusters from their sequence files
+   * 
+   * @param job a JobConf supplying the configuration used to open the cluster sequence files
+   * @param clustersIn a String pathname to the directory containing input cluster files
+   * @return a HashMap<Integer, Cluster> of the clusters keyed by clusterId
+   * 
+   * @throws ClassNotFoundException
+   * @throws InstantiationException
+   * @throws IllegalAccessException
+   * @throws IOException
+   * @throws SecurityException
+   * @throws NoSuchMethodException
+   * @throws InvocationTargetException
+   */
+  private HashMap<Integer, Cluster> loadClusters(JobConf job, String clustersIn) throws ClassNotFoundException,
+      InstantiationException, IllegalAccessException, IOException, SecurityException, NoSuchMethodException,
+      InvocationTargetException {
+    HashMap<Integer, Cluster> clusters = new HashMap<Integer, Cluster>();
+    File f = new File(clustersIn);
+    for (File part : f.listFiles()) {
+      if (!part.getName().startsWith(".")) {
+        Path inPart = new Path(clustersIn + "/" + part.getName());
+        FileSystem fs = FileSystem.get(inPart.toUri(), job);
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, job);
+        Writable key = (Writable) reader.getKeyClass().newInstance();
+        Writable value = (Writable) reader.getValueClass().newInstance();
+        while (reader.next(key, value)) {
+          Cluster cluster = (Cluster) value;
+          clusters.put(cluster.getId(), cluster);
+          value = (Writable) reader.getValueClass().newInstance();
+        }
+        reader.close();
+      }
+    }
+    return clusters;
+  }
+
+  double interClusterDensity() {
+    double sum = 0;
+    for (int cI : representativePoints.keySet()) {
+      for (int cJ : representativePoints.keySet()) {
+        if (cI == cJ) {
+          continue;
+        }
+        List<VectorWritable> repI = representativePoints.get(cI);
+        List<VectorWritable> repJ = representativePoints.get(cJ);
+        double minDistance = Double.MAX_VALUE;
+        Vector uIJ = null;
+        for (int ptI = 0; ptI < repI.size(); ptI++) {
+          for (int ptJ = 0; ptJ < repJ.size(); ptJ++) {
+            Vector vI = repI.get(ptI).get();
+            Vector vJ = repJ.get(ptJ).get();
+            double distance = measure.distance(vI, vJ);
+            if (distance < minDistance) {
+              minDistance = distance;
+              uIJ = vI.plus(vJ).divide(2);
+            }
+          }
+        }
+        double stDevI = stDevs.get(cI);
+        double stDevJ = stDevs.get(cJ);
+        double interDensity = interDensity(uIJ, cI, cJ);
+        double stdSum = stDevI + stDevJ;
+        double density = 0;
+        if (stdSum > 0) {
+          density = minDistance * interDensity / stdSum;
+        }
+
+        if (false) {
+          System.out.println("minDistance[" + cI + "," + cJ + "]=" + minDistance);
+          System.out.println("stDev[" + cI + "]=" + stDevI);
+          System.out.println("stDev[" + cJ + "]=" + stDevJ);
+          System.out.println("interDensity[" + cI + "," + cJ + "]=" + interDensity);
+          System.out.println("density[" + cI + "," + cJ + "]=" + density);
+          System.out.println();
+        }
+        sum += density;
+      }
+    }
+    System.out.println("interClusterDensity=" + sum);
+    return sum;
+  }
+
+  double interDensity(Vector uIJ, int cI, int cJ) {
+    List<VectorWritable> repI = representativePoints.get(cI);
+    List<VectorWritable> repJ = representativePoints.get(cJ);
+    double density = 0;
+    double std = (stDevs.get(cI) + stDevs.get(cJ)) / 2;
+    for (VectorWritable vwI : repI) {
+      if (measure.distance(uIJ, vwI.get()) <= std) {
+        density++;
+      }
+    }
+    for (VectorWritable vwJ : repJ) {
+      if (measure.distance(uIJ, vwJ.get()) <= std) {
+        density++;
+      }
+    }
+    return density / (repI.size() + repJ.size());
+  }
+
+  private void setStDev(int cI) {
+    List<VectorWritable> repPts = representativePoints.get(cI);
+    double d = 0;
+    if (repPts == null) {
+      System.out.println();
+    }
+    int s0 = 0;
+    Vector s1 = null;
+    Vector s2 = null;
+    for (VectorWritable vw : repPts) {
+      s0++;
+      Vector v = vw.get();
+      if (s1 == null) {
+        s1 = v.clone();
+      } else {
+        s1 = s1.plus(v);
+      }
+      if (s2 == null) {
+        s2 = v.times(v);
+      } else {
+        s2 = s2.plus(v.times(v));
+      }
+    }
+    Vector std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
+    d = std.zSum() / std.size();
+    System.out.println("stDev[" + cI + "]=" + d);
+    stDevs.put(cI, d);
+  }
+
+  double minRpDistance(List<VectorWritable> repI, List<VectorWritable> repJ) {
+    double minDistance = Double.MAX_VALUE;
+    for (int ptI = 0; ptI < repI.size(); ptI++) {
+      for (int ptJ = 0; ptJ < repJ.size(); ptJ++) {
+        double distance = measure.distance(repI.get(ptI).get(), repJ.get(ptJ).get());
+        if (distance < minDistance) {
+          minDistance = distance;
+        }
+      }
+    }
+    return minDistance;
+  }
+
+  double separation() {
+    double minDistance = Double.MAX_VALUE;
+    for (Integer cI : representativePoints.keySet()) {
+      for (int cJ : representativePoints.keySet()) {
+        if (cI == cJ) {
+          continue;
+        }
+        List<VectorWritable> repI = representativePoints.get(cI);
+        List<VectorWritable> repJ = representativePoints.get(cJ);
+        for (int ptI = 0; ptI < repI.size(); ptI++) {
+          for (int ptJ = 0; ptJ < repJ.size(); ptJ++) {
+            double distance = measure.distance(repI.get(ptI).get(), repJ.get(ptJ).get());
+            if (distance < minDistance) {
+              minDistance = distance;
+            }
+          }
+        }
+      }
+    }
+    double separation = minDistance / (1 + interClusterDensity());
+    System.out.println("separation=" + separation);
+    return separation;
+  }
+
+  double intraClusterDensity() {
+    double avgStd = 0;
+    for (Integer cId : representativePoints.keySet()) {
+      avgStd += stDevs.get(cId);
+    }
+    avgStd = avgStd / representativePoints.size();
+
+    double sum = 0;
+    for (Integer cId : representativePoints.keySet()) {
+      List<VectorWritable> repI = representativePoints.get(cId);
+      double cSum = 0;
+      for (int ptI = 0; ptI < repI.size(); ptI++) {
+        double inDensity = intraDensity(clusters.get(cId).getCenter(), repI.get(ptI).get(), avgStd);
+        Double std = stDevs.get(cId);
+        if (std > 0) {
+          cSum += inDensity / std;
+        }
+      }
+      sum += cSum / repI.size();
+    }
+    double intraClusterDensity = sum / representativePoints.size();
+    System.out.println("intraClusterDensity=" + intraClusterDensity);
+    return intraClusterDensity;
+  }
+
+  double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) {
+    if (measure.distance(clusterCenter, repPoint) <= avgStd) {
+      return 1;
+    }
+    return 0;
+  }
+}

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java Wed Apr 28 17:37:12 2010
@@ -35,36 +35,37 @@ import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.OutputLogFilter;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedPointWritable;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.math.VectorWritable;
 
-public class CDbwMapper extends MapReduceBase implements Mapper<IntWritable, VectorWritable, IntWritable, CDbwDistantPointWritable> {
+public class CDbwMapper extends MapReduceBase implements Mapper<IntWritable, VectorWritable, IntWritable, WeightedPointWritable> {
 
   private Map<Integer, List<VectorWritable>> representativePoints;
 
-  private Map<Integer, CDbwDistantPointWritable> mostDistantPoints = new HashMap<Integer, CDbwDistantPointWritable>();
+  private Map<Integer, WeightedPointWritable> mostDistantPoints = new HashMap<Integer, WeightedPointWritable>();
 
   private DistanceMeasure measure = new EuclideanDistanceMeasure();
 
-  private OutputCollector<IntWritable, CDbwDistantPointWritable> output = null;
+  private OutputCollector<IntWritable, WeightedPointWritable> output = null;
 
   @Override
-  public void map(IntWritable clusterId, VectorWritable point, OutputCollector<IntWritable, CDbwDistantPointWritable> output,
+  public void map(IntWritable clusterId, VectorWritable point, OutputCollector<IntWritable, WeightedPointWritable> output,
       Reporter reporter) throws IOException {
 
-      this.output = output;
+    this.output = output;
 
     int key = clusterId.get();
-    CDbwDistantPointWritable currentMDP = mostDistantPoints.get(key);
+    WeightedPointWritable currentMDP = mostDistantPoints.get(key);
 
     List<VectorWritable> refPoints = representativePoints.get(key);
     double totalDistance = 0.0;
     for (VectorWritable refPoint : refPoints) {
       totalDistance += measure.distance(refPoint.get(), point.get());
     }
-    if (currentMDP == null || currentMDP.getDistance() < totalDistance) {
-      mostDistantPoints.put(key, new CDbwDistantPointWritable(totalDistance, new VectorWritable(point.get().clone())));
+    if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
+      mostDistantPoints.put(key, new WeightedPointWritable(totalDistance, new VectorWritable(point.get().clone())));
     }
   }
 
@@ -73,7 +74,7 @@ public class CDbwMapper extends MapReduc
     this.measure = measure;
   }
 
-  public static Map<Integer, List<VectorWritable>> getReferencePoints(JobConf job) throws SecurityException,
+  public static Map<Integer, List<VectorWritable>> getRepresentativePoints(JobConf job) throws SecurityException,
       IllegalArgumentException, NoSuchMethodException, InvocationTargetException {
     String statePath = job.get(CDbwDriver.STATE_IN_KEY);
     Map<Integer, List<VectorWritable>> representativePoints = new HashMap<Integer, List<VectorWritable>>();
@@ -93,7 +94,8 @@ public class CDbwMapper extends MapReduc
               representativePoints.put(key.get(), repPoints);
             }
             repPoints.add(point);
-            point = new VectorWritable();          }
+            point = new VectorWritable();
+          }
         } finally {
           reader.close();
         }
@@ -111,7 +113,7 @@ public class CDbwMapper extends MapReduc
       ClassLoader ccl = Thread.currentThread().getContextClassLoader();
       Class<?> cl = ccl.loadClass(job.get(CDbwDriver.DISTANCE_MEASURE_KEY));
       measure = (DistanceMeasure) cl.newInstance();
-      representativePoints = getReferencePoints(job);
+      representativePoints = getRepresentativePoints(job);
     } catch (NumberFormatException e) {
       throw new IllegalStateException(e);
     } catch (SecurityException e) {

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java Wed Apr 28 17:37:12 2010
@@ -29,25 +29,25 @@ import org.apache.hadoop.mapred.MapReduc
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedPointWritable;
 import org.apache.mahout.math.VectorWritable;
 
-public class CDbwReducer extends MapReduceBase implements
-    Reducer<IntWritable, CDbwDistantPointWritable, IntWritable, VectorWritable> {
+public class CDbwReducer extends MapReduceBase implements Reducer<IntWritable, WeightedPointWritable, IntWritable, VectorWritable> {
 
   private Map<Integer, List<VectorWritable>> referencePoints;
 
   private OutputCollector<IntWritable, VectorWritable> output;
 
   @Override
-  public void reduce(IntWritable key, Iterator<CDbwDistantPointWritable> values,
-      OutputCollector<IntWritable, VectorWritable> output, Reporter reporter) throws IOException {
+  public void reduce(IntWritable key, Iterator<WeightedPointWritable> values, OutputCollector<IntWritable, VectorWritable> output,
+      Reporter reporter) throws IOException {
     this.output = output;
     // find the most distant point
-    CDbwDistantPointWritable mdp = null;
+    WeightedPointWritable mdp = null;
     while (values.hasNext()) {
-      CDbwDistantPointWritable dpw = values.next();
-      if (mdp == null || mdp.getDistance() < dpw.getDistance()) {
-        mdp = new CDbwDistantPointWritable(dpw.getDistance(), dpw.getPoint());
+      WeightedPointWritable dpw = values.next();
+      if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
+        mdp = new WeightedPointWritable(dpw.getWeight(), dpw.getPoint());
       }
     }
     output.collect(new IntWritable(key.get()), mdp.getPoint());
@@ -74,7 +74,7 @@ public class CDbwReducer extends MapRedu
   public void configure(JobConf job) {
     super.configure(job);
     try {
-      referencePoints = CDbwMapper.getReferencePoints(job);
+      referencePoints = CDbwMapper.getRepresentativePoints(job);
     } catch (NumberFormatException e) {
       throw new IllegalStateException(e);
     } catch (SecurityException e) {