You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/09/29 00:08:19 UTC

svn commit: r1002376 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Author: jeastman
Date: Tue Sep 28 22:08:19 2010
New Revision: 1002376

URL: http://svn.apache.org/viewvc?rev=1002376&view=rev
Log:
MAHOUT-513: Reviewed calculations from paper in detail, making some changes that seemed needed. All tests run but I'm still not very confident in the computed numbers

Modified:
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1002376&r1=1002375&r2=1002376&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Tue Sep 28 22:08:19 2010
@@ -18,6 +18,7 @@
 package org.apache.mahout.clustering.cdbw;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -39,6 +40,10 @@ import org.apache.mahout.math.function.S
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+/**
+ * This class calculates the CDbw metric as defined in
+ * http://www.db-net.aueb.gr/index.php/corporate/content/download/227/833/file/HV_poster2002.pdf 
+ */
 public class CDbwEvaluator {
 
   private static final Logger log = LoggerFactory.getLogger(CDbwEvaluator.class);
@@ -47,7 +52,7 @@ public class CDbwEvaluator {
 
   private final Map<Integer, Double> stDevs = new HashMap<Integer, Double>();
 
-  private final Map<Integer, Cluster> clusters;
+  private final List<Cluster> clusters;
 
   private final DistanceMeasure measure;
 
@@ -63,9 +68,7 @@ public class CDbwEvaluator {
    * @param measure
    *            an appropriate DistanceMeasure
    */
-  public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints,
-                       Map<Integer, Cluster> clusters,
-                       DistanceMeasure measure) {
+  public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints, List<Cluster> clusters, DistanceMeasure measure) {
     this.representativePoints = representativePoints;
     this.clusters = clusters;
     this.measure = measure;
@@ -101,9 +104,9 @@ public class CDbwEvaluator {
    *            a String pathname to the directory containing input cluster files
    * @return a List<Cluster> of the clusters
    */
-  private static Map<Integer, Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException,
+  private static List<Cluster> loadClusters(Configuration conf, Path clustersIn) throws InstantiationException,
       IllegalAccessException, IOException {
-    Map<Integer, Cluster> clusters = new HashMap<Integer, Cluster>();
+    List<Cluster> clusters = new ArrayList<Cluster>();
     FileSystem fs = clustersIn.getFileSystem(conf);
     for (FileStatus part : fs.listStatus(clustersIn)) {
       if (!part.getPath().getName().startsWith(".")) {
@@ -113,7 +116,7 @@ public class CDbwEvaluator {
         Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
         while (reader.next(key, value)) {
           Cluster cluster = (Cluster) value;
-          clusters.put(cluster.getId(), cluster);
+          clusters.add(cluster);
           value = reader.getValueClass().asSubclass(Writable.class).newInstance();
         }
         reader.close();
@@ -122,7 +125,14 @@ public class CDbwEvaluator {
     return clusters;
   }
 
+  /**
+   * Compute the standard deviation of the representative points for the given cluster.
+   * Store these in stDevs, indexed by cI
+   * 
+   * @param cI a int clusterId. 
+   */
   private void computeStd(int cI) {
+    // TODO: verify this approach
     List<VectorWritable> repPts = representativePoints.get(cI);
     int s0 = 0;
     Vector s1 = null;
@@ -167,7 +177,7 @@ public class CDbwEvaluator {
     if (pruned) {
       return;
     }
-    for (Iterator<Cluster> it = clusters.values().iterator(); it.hasNext();) {
+    for (Iterator<Cluster> it = clusters.iterator(); it.hasNext();) {
       Cluster cluster = it.next();
       if (invalidCluster(cluster)) {
         log.info("Pruning cluster Id=" + cluster.getId());
@@ -178,82 +188,116 @@ public class CDbwEvaluator {
     pruned = true;
   }
 
+  /**
+   * Compute the term density (eqn 2) used for inter-cluster density calculation
+   * 
+   * @param uIJ the Vector midpoint between the closest representative of the clusters
+   * @param cI the int clusterId of the i-th cluster
+   * @param cJ the int clusterId of the j-th cluster
+   * @return a double
+   */
   double interDensity(Vector uIJ, int cI, int cJ) {
     List<VectorWritable> repI = representativePoints.get(cI);
     List<VectorWritable> repJ = representativePoints.get(cJ);
-    double density = 0.0;
-    double std = (stDevs.get(cI) + stDevs.get(cJ)) / 2.0;
+    double sum = 0.0;
+    Double stdevI = stDevs.get(cI);
+    Double stdevJ = stDevs.get(cJ);
+    // count the number of representative points of the clusters which are within the
+    // average std of the two clusters from the midpoint uIJ (eqn 3)
+    double avgStd = (stdevI + stdevJ) / 2.0;
     for (VectorWritable vwI : repI) {
-      if (measure.distance(uIJ, vwI.get()) <= std) {
-        density++;
+      if (measure.distance(uIJ, vwI.get()) <= avgStd) {
+        sum++;
       }
     }
     for (VectorWritable vwJ : repJ) {
-      if (measure.distance(uIJ, vwJ.get()) <= std) {
-        density++;
+      if (measure.distance(uIJ, vwJ.get()) <= avgStd) {
+        sum++;
       }
     }
-    return density / (repI.size() + repJ.size());
-  }
-
-  double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) {
-    return measure.distance(clusterCenter, repPoint) <= avgStd ? 1.0 : 0.0;
+    int nI = repI.size();
+    int nJ = repJ.size();
+    return sum / (nI + nJ);
   }
 
+  /**
+   * Compute the validity index (eqn 8)
+   * 
+   * @return a double
+   */
   public double getCDbw() {
     pruneInvalidClusters();
     return intraClusterDensity() * separation();
   }
 
+  /**
+   * The average density within clusters is defined as the percentage of points that belong 
+   * to the neighborhood of representative points of the considered clusters. The goal is 
+   * the density within clusters to be significant high. (eqn 5)
+   * 
+   * @return a double
+   */
   public double intraClusterDensity() {
     pruneInvalidClusters();
-    double avgStd = 0.0;
-    for (Integer cId : representativePoints.keySet()) {
-      avgStd += stDevs.get(cId);
-    }
-    avgStd /= representativePoints.size();
-
-    double sum = 0.0;
-    for (Map.Entry<Integer, List<VectorWritable>> entry : representativePoints.entrySet()) {
-      Integer cId = entry.getKey();
-      List<VectorWritable> repI = entry.getValue();
-      double cSum = 0.0;
-      for (VectorWritable aRepI : repI) {
-        double inDensity = intraDensity(clusters.get(cId).getCenter(), aRepI.get(), avgStd);
-        double std = stDevs.get(cId);
-        if (std > 0.0) {
-          cSum += inDensity / std;
-        }
-      }
-      if (repI.size() > 0) {
-        sum += cSum / repI.size();
+    // compute the average standard deviation of the clusters
+    double stdev = 0.0;
+    for (Integer cI : representativePoints.keySet()) {
+      stdev += stDevs.get(cI);
+    }
+    int c = representativePoints.size();
+    stdev /= c;
+    // accumulate the summations
+    double sumI = 0.0;
+    for (int i = 0; i < clusters.size(); i++) {
+      Integer cI = clusters.get(i).getId();
+      List<VectorWritable> repPtsI = representativePoints.get(cI);
+      int r = repPtsI.size();
+      double sumJ = 0.0;
+      // compute the term density (eqn 6) 
+      for (int j = 0; j < r; j++) {
+        // compute f(x, vIJ) (eqn 7)
+        Vector repJ = repPtsI.get(j).get();
+        double densityIJ = (measure.distance(clusters.get(i).getCenter(), repJ) <= stdev ? 1.0 : 0.0);
+        // accumulate sumJ
+        sumJ += densityIJ / stdev;
       }
+      // accumulate sumI
+      sumI += sumJ / r;
     }
-    return sum / representativePoints.size();
+    return sumI / c;
   }
 
+  /**
+   * This function evaluates the average density in the region among clusters (eqn 1). 
+   * The goal is the density in the area among clusters to be significant low.
+   * 
+   * @return a double
+   */
   public double interClusterDensity() {
     pruneInvalidClusters();
     double sum = 0.0;
-    for (Map.Entry<Integer, List<VectorWritable>> entry1 : representativePoints.entrySet()) {
-      Integer cI = entry1.getKey();
-      List<VectorWritable> repI = entry1.getValue();
-      for (Map.Entry<Integer, List<VectorWritable>> entry2 : representativePoints.entrySet()) {
-        Integer cJ = entry2.getKey();
-        if (cI.equals(cJ)) {
+    // find the closest representative points between the clusters
+    for (int i = 0; i < clusters.size(); i++) {
+      Integer cI = clusters.get(i).getId();
+      List<VectorWritable> repI = representativePoints.get(cI);
+      for (int j = 1; j < clusters.size(); j++) {
+        Integer cJ = clusters.get(j).getId();
+        if (i == j) {
           continue;
         }
-        List<VectorWritable> repJ = entry2.getValue();
-        double minDistance = Double.MAX_VALUE;
-        Vector uIJ = null;
+        List<VectorWritable> repJ = representativePoints.get(cJ);
+        double minDistance = Double.MAX_VALUE; // the distance between the closest representative points
+        Vector uIJ = null; // the midpoint between the closest representative points
+        // find the closest representative points between the i-th and j-th clusters
         for (VectorWritable aRepI : repI) {
           for (VectorWritable aRepJ : repJ) {
-            Vector vI = aRepI.get();
-            Vector vJ = aRepJ.get();
-            double distance = measure.distance(vI, vJ);
+            Vector closRepI = aRepI.get();
+            Vector closRepJ = aRepJ.get();
+            double distance = measure.distance(closRepI, closRepJ);
             if (distance < minDistance) {
+              // set the distance and compute the midpoint
               minDistance = distance;
-              uIJ = vI.plus(vJ).divide(2);
+              uIJ = closRepI.plus(closRepJ).divide(2);
             }
           }
         }
@@ -275,31 +319,42 @@ public class CDbwEvaluator {
         sum += density;
       }
     }
-    //System.out.println("interClusterDensity=" + sum);
+    log.info("interClusterDensity=" + sum);
     return sum;
   }
 
+  /**
+   * Calculate the separation of clusters (eqn 4) taking into account both the distances between the closest
+   * clusters and the Inter-cluster density. The goal is the distances among clusters to be high while 
+   * the density in the area among them to be low.
+   * 
+   * @return a double
+   */
   public double separation() {
     pruneInvalidClusters();
-    double minDistance = Double.MAX_VALUE;
-    for (Map.Entry<Integer, List<VectorWritable>> entry1 : representativePoints.entrySet()) {
-      Integer cI = entry1.getKey();
-      List<VectorWritable> repI = entry1.getValue();
-      for (Map.Entry<Integer, List<VectorWritable>> entry2 : representativePoints.entrySet()) {
-        if (cI.equals(entry2.getKey())) {
+    double minDistanceSum = 0;
+    for (int i = 0; i < clusters.size(); i++) {
+      Integer cI = clusters.get(i).getId();
+      List<VectorWritable> closRepI = representativePoints.get(cI);
+      for (int j = 0; j < clusters.size(); j++) {
+        if (i == j) {
           continue;
         }
-        List<VectorWritable> repJ = entry2.getValue();
-        for (VectorWritable aRepI : repI) {
-          for (VectorWritable aRepJ : repJ) {
+        // find min{d(closRepI, closRepJ)}
+        Integer cJ = clusters.get(j).getId();
+        List<VectorWritable> closRepJ = representativePoints.get(cJ);
+        double minDistance = Double.MAX_VALUE;
+        for (VectorWritable aRepI : closRepI) {
+          for (VectorWritable aRepJ : closRepJ) {
             double distance = measure.distance(aRepI.get(), aRepJ.get());
             if (distance < minDistance) {
               minDistance = distance;
             }
           }
         }
+        minDistanceSum += minDistance;
       }
     }
-    return minDistance / (1.0 + interClusterDensity());
+    return minDistanceSum / (1.0 + interClusterDensity());
   }
 }

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1002376&r1=1002375&r2=1002376&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Tue Sep 28 22:08:19 2010
@@ -58,7 +58,7 @@ public final class TestCDbwEvaluator ext
 
   private Map<Integer, List<VectorWritable>> representativePoints;
 
-  private Map<Integer, Cluster> clusters;
+  private List<Cluster> clusters;
 
   @Override
   @Before
@@ -101,13 +101,13 @@ public final class TestCDbwEvaluator ext
    * @param measure the DistanceMeasure
    */
   private void initData(double dC, double dP, DistanceMeasure measure) {
-    clusters = new HashMap<Integer, Cluster>();
-    clusters.put(1, new Canopy(new DenseVector(new double[] { -dC, -dC }), 1, measure));
-    clusters.put(3, new Canopy(new DenseVector(new double[] { -dC, dC }), 3, measure));
-    clusters.put(5, new Canopy(new DenseVector(new double[] { dC, dC }), 5, measure));
-    clusters.put(7, new Canopy(new DenseVector(new double[] { dC, -dC }), 7, measure));
+    clusters = new ArrayList<Cluster>();
+    clusters.add(new Canopy(new DenseVector(new double[] { -dC, -dC }), 1, measure));
+    clusters.add(new Canopy(new DenseVector(new double[] { -dC, dC }), 3, measure));
+    clusters.add(new Canopy(new DenseVector(new double[] { dC, dC }), 5, measure));
+    clusters.add(new Canopy(new DenseVector(new double[] { dC, -dC }), 7, measure));
     representativePoints = new HashMap<Integer, List<VectorWritable>>();
-    for (Cluster cluster : clusters.values()) {
+    for (Cluster cluster : clusters) {
       List<VectorWritable> points = new ArrayList<VectorWritable>();
       representativePoints.put(cluster.getId(), points);
       points.add(new VectorWritable(cluster.getCenter().clone()));
@@ -124,9 +124,9 @@ public final class TestCDbwEvaluator ext
     initData(1, 0.25, measure);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
     assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
     assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -135,9 +135,9 @@ public final class TestCDbwEvaluator ext
     initData(1, 0.5, measure);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
     assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 1.0, evaluator.separation(), EPSILON);
+    assertEquals("separation", 13.656854249492381, evaluator.separation(), EPSILON);
     assertEquals("intra cluster density", 0.44721359549995787, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 0.44721359549995787, evaluator.getCDbw(), EPSILON);
+    assertEquals("CDbw", 6.107530892134367, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -145,10 +145,10 @@ public final class TestCDbwEvaluator ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.75, measure);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
-    assertEquals("inter cluster density", 1.017921815355728, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 0.24777966925931558, evaluator.separation(), EPSILON);
+    assertEquals("inter cluster density", 0.7634413615167959, evaluator.interClusterDensity(), EPSILON);
+    assertEquals("separation", 3.8722167199667066, evaluator.separation(), EPSILON);
     assertEquals("intra cluster density", 0.29814239699997197, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 0.07387362452083261, evaluator.getCDbw(), EPSILON);
+    assertEquals("CDbw", 1.1544719745942431, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -156,14 +156,14 @@ public final class TestCDbwEvaluator ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, measure);
-    clusters.put(cluster.getId(), cluster);
+    clusters.add(cluster);
     List<VectorWritable> points = new ArrayList<VectorWritable>();
     representativePoints.put(cluster.getId(), points);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
     assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
     assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -171,15 +171,15 @@ public final class TestCDbwEvaluator ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
-    clusters.put(cluster.getId(), cluster);
+    clusters.add(cluster);
     List<VectorWritable> points = new ArrayList<VectorWritable>();
     points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { 1, 1 }))));
     representativePoints.put(cluster.getId(), points);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
     assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
     assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
   }
 
   /**
@@ -191,7 +191,7 @@ public final class TestCDbwEvaluator ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, measure);
-    clusters.put(cluster.getId(), cluster);
+    clusters.add(cluster);
     List<VectorWritable> points = new ArrayList<VectorWritable>();
     points.add(new VectorWritable(cluster.getCenter()));
     points.add(new VectorWritable(cluster.getCenter()));
@@ -199,9 +199,9 @@ public final class TestCDbwEvaluator ext
     representativePoints.put(cluster.getId(), points);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
     assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 1.5, evaluator.separation(), EPSILON);
+    assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
     assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 1.3416407864998736, evaluator.getCDbw(), EPSILON);
+    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
   }
 
   @Test