You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/28 19:37:13 UTC
svn commit: r939019 [2/2] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/main/java/org/apache/mah...
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Wed Apr 28 17:37:12 2010
@@ -1,16 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.mahout.clustering.cdbw;
import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.dirichlet.DirichletDriver;
@@ -33,6 +55,10 @@ public class TestCDbwEvaluator extends M
private List<VectorWritable> sampleData;
+ private Map<Integer, List<VectorWritable>> representativePoints; // keyed by cluster.getId(); rebuilt by initData
+
+ Map<Integer, Cluster> clusters; // keyed by the integer ids 1, 3, 5, 7 used in initData; rebuilt by initData
+
@Override
protected void setUp() throws Exception {
super.setUp();
@@ -76,10 +102,61 @@ public class TestCDbwEvaluator extends M
}
}
+ /**
+ * Initialize synthetic data: 4 clusters centered at (-dC, -dC), (-dC, dC), (dC, dC) and (dC, -dC), each having
+ * 5 representative points (a copy of the center plus the 4 points offset from it by (+/-dP, +/-dP)).
+ * Replaces the contents of the clusters and representativePoints fields.
+ * @param dC a double cluster center offset, applied along each axis
+ * @param dP a double representative point offset, applied along each axis
+ */
+ private void initData(double dC, double dP) {
+ clusters = new HashMap<Integer, Cluster>();
+ clusters.put(1, new Canopy(new DenseVector(new double[] { -dC, -dC }), 1));
+ clusters.put(3, new Canopy(new DenseVector(new double[] { -dC, dC }), 3));
+ clusters.put(5, new Canopy(new DenseVector(new double[] { dC, dC }), 5));
+ clusters.put(7, new Canopy(new DenseVector(new double[] { dC, -dC }), 7));
+ representativePoints = new HashMap<Integer, List<VectorWritable>>();
+ for (Cluster cluster : clusters.values()) {
+ // declared against the List interface, matching the value type of representativePoints
+ List<VectorWritable> points = new ArrayList<VectorWritable>();
+ representativePoints.put(cluster.getId(), points);
+ // first representative point: a clone of the cluster center
+ points.add(new VectorWritable(cluster.getCenter().clone()));
+ // remaining four: the corners of a square of half-width dP around the center
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { dP, dP }))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { dP, -dP }))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { -dP, -dP }))));
+ points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] { -dP, dP }))));
+ }
+ }
+
+ /**
+ * Builds the synthetic data with initData(1, 0.25) and checks the evaluator's inter-cluster density,
+ * separation, intra-cluster density and CDbw values against the expected numbers.
+ */
+ public void testCDbw0() {
+ initData(1, 0.25);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+ // compare with a tolerance: asserting exact equality on computed floating-point results is brittle
+ double epsilon = 1.0e-10;
+ assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), epsilon);
+ assertEquals("separation", 1.5, evaluator.separation(), epsilon);
+ assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), epsilon);
+ assertEquals("CDbw", 1.3416407864998736, evaluator.CDbw(), epsilon);
+ }
+
+ /**
+ * Builds the synthetic data with initData(1, 0.5) and checks the evaluator's inter-cluster density,
+ * separation, intra-cluster density and CDbw values against the expected numbers.
+ */
+ public void testCDbw1() {
+ initData(1, 0.5);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+ // compare with a tolerance: asserting exact equality on computed floating-point results is brittle
+ double epsilon = 1.0e-10;
+ assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), epsilon);
+ assertEquals("separation", 1.0, evaluator.separation(), epsilon);
+ assertEquals("intra cluster density", 0.44721359549995787, evaluator.intraClusterDensity(), epsilon);
+ assertEquals("CDbw", 0.44721359549995787, evaluator.CDbw(), epsilon);
+ }
+
+ /**
+ * Builds the synthetic data with initData(1, 0.75) and checks the evaluator's inter-cluster density,
+ * separation, intra-cluster density and CDbw values against the expected numbers.
+ */
+ public void testCDbw2() {
+ initData(1, 0.75);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+ // compare with a tolerance: asserting exact equality on computed floating-point results is brittle
+ double epsilon = 1.0e-10;
+ assertEquals("inter cluster density", 1.017921815355728, evaluator.interClusterDensity(), epsilon);
+ assertEquals("separation", 0.24777966925931558, evaluator.separation(), epsilon);
+ assertEquals("intra cluster density", 0.29814239699997197, evaluator.intraClusterDensity(), epsilon);
+ assertEquals("CDbw", 0.07387362452083261, evaluator.CDbw(), epsilon);
+ }
+
public void testCanopy() throws Exception { // run the Canopy clustering job on "testdata", writing to "output"
CanopyClusteringJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
int numIterations = 2; // passed to both CDbwDriver.runJob and checkRefPoints
- CDbwDriver.runJob("output/clusters-0", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-0", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+ numIterations, 1);
checkRefPoints(numIterations);
}
@@ -89,7 +166,8 @@ public class TestCDbwEvaluator extends M
// now run the KMeans job
KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
int numIterations = 2;
- CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+ numIterations, 1);
checkRefPoints(numIterations);
}
@@ -97,16 +175,19 @@ public class TestCDbwEvaluator extends M
// now run the Canopy job to prime kMeans canopies
CanopyDriver.runJob("testdata", "output/clusters-0", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
// now run the FuzzyKMeans job
- FuzzyKMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1, 2);
+ FuzzyKMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1,
+ 2);
int numIterations = 2;
- CDbwDriver.runJob("output/clusters-4", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-4", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+ numIterations, 1);
checkRefPoints(numIterations);
}
public void testMeanShift() throws Exception { // run the MeanShift canopy job on "testdata", then the CDbw driver on its output
MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10);
int numIterations = 2; // passed to both CDbwDriver.runJob and checkRefPoints
- CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+ numIterations, 1);
checkRefPoints(numIterations);
}
@@ -115,7 +196,8 @@ public class TestCDbwEvaluator extends M
DirichletDriver.runJob("testdata", "output", L1ModelDistribution.class.getName(), prototype.getClass().getName(), prototype
.size(), 15, 5, 1.0, 1);
int numIterations = 2;
- CDbwDriver.runJob("output/clusters-5", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-5", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(),
+ numIterations, 1);
checkRefPoints(numIterations);
}