You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/28 19:37:13 UTC

svn commit: r939019 [2/2] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/main/java/org/apache/mah...

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Wed Apr 28 17:37:12 2010
@@ -1,16 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.mahout.clustering.cdbw;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.canopy.Canopy;
 import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.dirichlet.DirichletDriver;
@@ -33,6 +55,10 @@ public class TestCDbwEvaluator extends M
 
   private List<VectorWritable> sampleData;
 
+  private Map<Integer, List<VectorWritable>> representativePoints; // cluster id -> its representative points; set by initData
+
+  Map<Integer, Cluster> clusters; // clusters handed to CDbwEvaluator in the testCDbw* tests; set by initData
+
   @Override
   protected void setUp() throws Exception {
     super.setUp();
@@ -76,10 +102,61 @@ public class TestCDbwEvaluator extends M
     }
   }
 
+  /**
+   * Initialize 4 clusters centered at (+-dC, +-dC), each with 5 representative points: its center and the 4 points (+-dP, +-dP) away
+   * @param dC a double: absolute value of both coordinates of every cluster center
+   * @param dP a double: absolute per-coordinate offset of the 4 non-center representative points
+   */
+  private void initData(double dC, double dP) {
+    clusters = new HashMap<Integer, Cluster>();
+    clusters.put(1, new Canopy(new DenseVector(new double[] { -dC, -dC }), 1));
+    clusters.put(3, new Canopy(new DenseVector(new double[] { -dC, dC }), 3));
+    clusters.put(5, new Canopy(new DenseVector(new double[] { dC, dC }), 5));
+    clusters.put(7, new Canopy(new DenseVector(new double[] { dC, -dC }), 7));
+    double[][] offsets = { { dP, dP }, { dP, -dP }, { -dP, -dP }, { -dP, dP } }; // added after the center, in this order
+    representativePoints = new HashMap<Integer, List<VectorWritable>>();
+    for (Cluster cluster : clusters.values()) {
+      List<VectorWritable> points = new ArrayList<VectorWritable>();
+      representativePoints.put(cluster.getId(), points);
+      points.add(new VectorWritable(cluster.getCenter().clone())); // the center itself is the first representative point
+      for (double[] offset : offsets) {
+        points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(offset))));
+      }
+    }
+  }
+
+  public void testCDbw0() { // dC = 1, dP = 0.25; computed doubles are compared within an explicit 1e-10 tolerance
+    initData(1, 0.25);
+    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), 1.0e-10);
+    assertEquals("separation", 1.5, evaluator.separation(), 1.0e-10);
+    assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), 1.0e-10);
+    assertEquals("CDbw", 1.3416407864998736, evaluator.CDbw(), 1.0e-10);
+  }
+
+  public void testCDbw1() { // dC = 1, dP = 0.5; computed doubles are compared within an explicit 1e-10 tolerance
+    initData(1, 0.5);
+    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+    assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), 1.0e-10);
+    assertEquals("separation", 1.0, evaluator.separation(), 1.0e-10);
+    assertEquals("intra cluster density", 0.44721359549995787, evaluator.intraClusterDensity(), 1.0e-10);
+    assertEquals("CDbw", 0.44721359549995787, evaluator.CDbw(), 1.0e-10);
+  }
+
+  public void testCDbw2() { // dC = 1, dP = 0.75; computed doubles are compared within an explicit 1e-10 tolerance
+    initData(1, 0.75);
+    CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+    assertEquals("inter cluster density", 1.017921815355728, evaluator.interClusterDensity(), 1.0e-10);
+    assertEquals("separation", 0.24777966925931558, evaluator.separation(), 1.0e-10);
+    assertEquals("intra cluster density", 0.29814239699997197, evaluator.intraClusterDensity(), 1.0e-10);
+    assertEquals("CDbw", 0.07387362452083261, evaluator.CDbw(), 1.0e-10);
+  }
+
   public void testCanopy() throws Exception { // now run the Job
     CanopyClusteringJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
     int numIterations = 2;
-    CDbwDriver.runJob("output/clusters-0", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+    CDbwDriver.runJob("output/clusters-0", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+        numIterations, 1);
     checkRefPoints(numIterations);
   }
 
@@ -89,7 +166,8 @@ public class TestCDbwEvaluator extends M
     // now run the KMeans job
     KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
     int numIterations = 2;
-    CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+    CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+        numIterations, 1);
     checkRefPoints(numIterations);
   }
 
@@ -97,16 +175,19 @@ public class TestCDbwEvaluator extends M
     // now run the Canopy job to prime kMeans canopies
     CanopyDriver.runJob("testdata", "output/clusters-0", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
     // now run the KMeans job
-    FuzzyKMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1, 2);
+    FuzzyKMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1,
+        2);
     int numIterations = 2;
-    CDbwDriver.runJob("output/clusters-4", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+    CDbwDriver.runJob("output/clusters-4", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+        numIterations, 1);
     checkRefPoints(numIterations);
   }
 
   public void testMeanShift() throws Exception {
     MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10);
     int numIterations = 2;
-    CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+    CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+        numIterations, 1);
     checkRefPoints(numIterations);
   }
 
@@ -115,7 +196,8 @@ public class TestCDbwEvaluator extends M
     DirichletDriver.runJob("testdata", "output", L1ModelDistribution.class.getName(), prototype.getClass().getName(), prototype
         .size(), 15, 5, 1.0, 1);
     int numIterations = 2;
-    CDbwDriver.runJob("output/clusters-5", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+    CDbwDriver.runJob("output/clusters-5", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+        numIterations, 1);
     checkRefPoints(numIterations);
   }