You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/28 19:37:13 UTC
svn commit: r939019 [1/2] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/main/java/org/apache/mah...
Author: jeastman
Date: Wed Apr 28 17:37:12 2010
New Revision: 939019
URL: http://svn.apache.org/viewvc?rev=939019&view=rev
Log:
MAHOUT-236:
- replaced NamedVector references with Vector in clustering code
- replaced NamedVector references with Vector in clustering examples
- removed coercions to RandomAccessSparseVector from cluster constructors
- fixed FuzzyKMeans tests to reflect new clustering implementation
- moved CDbwDistantPointWritable to core WeightedPointWritable
- updated CDbw evaluator with above
all tests run
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
Removed:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDistantPointWritable.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java Wed Apr 28 17:37:12 2010
@@ -26,7 +26,6 @@ import java.util.Locale;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.math.JsonVectorAdapter;
-import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
@@ -55,7 +54,7 @@ public abstract class ClusterBase implem
private int id;
// the current cluster center
- private Vector center = new RandomAccessSparseVector(0);
+ private Vector center;
// the number of points in the cluster
private int numPoints;
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java?rev=939019&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java Wed Apr 28 17:37:12 2010
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.VectorWritable;
+
+public class WeightedPointWritable implements Writable {
+
+ /**
+ * @return the weight
+ */
+ public double getWeight() {
+ return weight;
+ }
+
+ /**
+ * @return the point
+ */
+ public VectorWritable getPoint() {
+ return point;
+ }
+
+ public WeightedPointWritable(double weight, VectorWritable point) {
+ super();
+ this.weight = weight;
+ this.point = point;
+ }
+
+ public WeightedPointWritable() {
+ super();
+ }
+
+ private double weight;
+
+ private VectorWritable point;
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ weight = in.readDouble();
+ point = new VectorWritable();
+ point.readFields(in);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeDouble(weight);
+ point.write(out);
+ }
+
+ public String toString() {
+ return String.valueOf(weight) + ": " + (point == null ? "null" : ClusterBase.formatVector(point.get(), null));
+ }
+
+}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java Wed Apr 28 17:37:12 2010
@@ -49,8 +49,8 @@ public class Canopy extends ClusterBase
*/
public Canopy(Vector point, int canopyId) {
this.setId(canopyId);
- this.setCenter(new RandomAccessSparseVector(point.clone()));
- this.setPointTotal(getCenter().clone());
+ this.setCenter(point.clone());
+ this.setPointTotal(point.clone());
this.setNumPoints(1);
}
@@ -65,7 +65,7 @@ public class Canopy extends ClusterBase
super.readFields(in);
VectorWritable temp = new VectorWritable();
temp.readFields(in);
- this.setCenter(new RandomAccessSparseVector(temp.get()));
+ this.setCenter(temp.get());
this.setPointTotal(getCenter().clone());
this.setNumPoints(1);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java Wed Apr 28 17:37:12 2010
@@ -201,10 +201,16 @@ public class FuzzyKMeansClusterer {
* the initial List<SoftCluster> of clusters
* @param measure
* the DistanceMeasure to use
+ * @param threshold
+ * the double convergence threshold
+ * @param m
+ * the double "fuzziness" argument (>1)
* @param numIter
* the maximum number of iterations
+ * @return
+ * a List<List<SoftCluster>> of clusters produced per iteration
*/
- public static List<List<SoftCluster>> clusterPoints(List<NamedVector> points,
+ public static List<List<SoftCluster>> clusterPoints(List<Vector> points,
List<SoftCluster> clusters,
DistanceMeasure measure,
double threshold,
@@ -237,7 +243,7 @@ public class FuzzyKMeansClusterer {
* the List<Cluster> clusters
* @return
*/
- public static boolean runFuzzyKMeansIteration(List<NamedVector> points,
+ public static boolean runFuzzyKMeansIteration(List<Vector> points,
List<SoftCluster> clusterList,
FuzzyKMeansClusterer clusterer) {
for (Vector point : points) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java Wed Apr 28 17:37:12 2010
@@ -23,7 +23,6 @@ import java.io.IOException;
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.math.AbstractVector;
-import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.Functions;
@@ -93,7 +92,7 @@ public class SoftCluster extends Cluster
* the center point
*/
public SoftCluster(Vector center) {
- setCenter(new RandomAccessSparseVector(center));
+ setCenter(center.clone());
this.pointProbSum = 0;
this.weightedPointTotal = getCenter().like();
}
@@ -112,7 +111,7 @@ public class SoftCluster extends Cluster
converged = in.readBoolean();
VectorWritable temp = new VectorWritable();
temp.readFields(in);
- this.setCenter(new RandomAccessSparseVector(temp.get()));
+ this.setCenter(temp.get());
this.pointProbSum = 0;
this.weightedPointTotal = getCenter().like();
}
@@ -124,6 +123,8 @@ public class SoftCluster extends Cluster
*/
@Override
public Vector computeCentroid() {
+ if (centroid != null)
+ return centroid;
if (pointProbSum == 0) {
return weightedPointTotal;
} else if (centroid == null) {
@@ -157,7 +158,7 @@ public class SoftCluster extends Cluster
@Override
public String toString() {
- return getIdentifier() + ": " + getCenter().asFormatString();
+ return asFormatString(null);
}
@Override
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Wed Apr 28 17:37:12 2010
@@ -104,7 +104,7 @@ public class Cluster extends ClusterBase
this.converged = in.readBoolean();
VectorWritable temp = new VectorWritable();
temp.readFields(in);
- this.setCenter(new RandomAccessSparseVector(temp.get()));
+ this.setCenter(temp.get());
this.setNumPoints(0);
this.setPointTotal(getCenter().like());
this.pointSquaredTotal = getCenter().like();
@@ -134,7 +134,7 @@ public class Cluster extends ClusterBase
*/
public Cluster(Vector center) {
super();
- this.setCenter(new RandomAccessSparseVector(center));
+ this.setCenter(center.clone());
this.setNumPoints(0);
this.setPointTotal(getCenter().like());
this.pointSquaredTotal = getCenter().like();
@@ -152,7 +152,7 @@ public class Cluster extends ClusterBase
public Cluster(Vector center, int clusterId) {
super();
this.setId(clusterId);
- this.setCenter(new RandomAccessSparseVector(center));
+ this.setCenter(center.clone());
this.setNumPoints(0);
this.setPointTotal(getCenter().like());
this.pointSquaredTotal = getCenter().like();
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Wed Apr 28 17:37:12 2010
@@ -111,7 +111,7 @@ public class KMeansClusterer {
* @param maxIter
* the maximum number of iterations
*/
- public static List<List<Cluster>> clusterPoints(List<NamedVector> points,
+ public static List<List<Cluster>> clusterPoints(List<Vector> points,
List<Cluster> clusters,
DistanceMeasure measure,
int maxIter,
@@ -146,7 +146,7 @@ public class KMeansClusterer {
* a DistanceMeasure to use
* @return
*/
- public static boolean runKMeansIteration(List<NamedVector> points,
+ public static boolean runKMeansIteration(List<Vector> points,
List<Cluster> clusters,
DistanceMeasure measure,
double distanceThreshold) {
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Wed Apr 28 17:37:12 2010
@@ -37,14 +37,13 @@ import org.apache.mahout.common.DummyRep
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
public class TestFuzzyKmeansClustering extends MahoutTestCase {
-
+
private FileSystem fs;
-
+
private static void rmr(String path) {
File f = new File(path);
if (f.exists()) {
@@ -57,7 +56,7 @@ public class TestFuzzyKmeansClustering e
f.delete();
}
}
-
+
@Override
protected void setUp() throws Exception {
super.setUp();
@@ -66,86 +65,98 @@ public class TestFuzzyKmeansClustering e
Configuration conf = new Configuration();
fs = FileSystem.get(conf);
}
-
+
private static double round(double val, int places) {
long factor = (long) Math.pow(10, places);
-
+
// Shift the decimal the correct number of places
// to the right.
val *= factor;
-
+
// Round to the nearest integer.
long tmp = Math.round(val);
-
+
// Shift the decimal the correct number of places
// back to the left.
return (double) tmp / factor;
}
-
+
private static Vector tweakValue(Vector point) {
return point.plus(0.1);
-
+
}
-
- private static void computeCluster(List<NamedVector> points,
- List<SoftCluster> clusterList,
- FuzzyKMeansClusterer clusterer,
- Map<String,String> pointClusterInfo) {
-
- for (NamedVector point : points) {
- StringBuilder outputValue = new StringBuilder("[");
+
+ private static void computeCluster(List<Vector> points, List<SoftCluster> clusterList, FuzzyKMeansClusterer clusterer,
+ Map<Integer, List<Vector>> pointClusterInfo) {
+
+ for (Vector point : points) {
List<Double> clusterDistanceList = new ArrayList<Double>();
+ SoftCluster closestCluster = null;
+ double closestDistance = Double.MAX_VALUE;
for (SoftCluster cluster : clusterList) {
- clusterDistanceList.add(clusterer.getMeasure().distance(point, cluster.getCenter()));
+ double distance = clusterer.getMeasure().distance(point, cluster.getCenter());
+ if (distance < closestDistance) {
+ closestDistance = distance;
+ closestCluster = cluster;
+ }
+ clusterDistanceList.add(distance);
+ }
+ List<Vector> list = pointClusterInfo.get(closestCluster.getId());
+ if (list == null) {
+ list = new ArrayList<Vector>();
+ pointClusterInfo.put(closestCluster.getId(), list);
}
+ list.add(point);
+ double totalProb = 0;
for (int i = 0; i < clusterList.size(); i++) {
+ SoftCluster cluster = clusterList.get(i);
double probWeight = clusterer.computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
- outputValue.append(clusterList.get(i).getId()).append(':').append(probWeight).append(' ');
+ totalProb += probWeight;
}
- pointClusterInfo.put(point.getName(), outputValue.toString().trim() + ']');
+ assertTrue("total probability", Math.abs(1.0 - totalProb) < 0.0001);
+ }
+
+ for (SoftCluster cluster : clusterList) {
+ System.out.println(cluster.asFormatString(null));
+ List<Vector> list = pointClusterInfo.get(cluster.getId());
+ if (list != null)
+ for (Vector vector : list) {
+ System.out.println("\t" + vector);
+ }
}
}
-
+
public void testReferenceImplementation() throws Exception {
- List<NamedVector> points = TestKmeansClustering.getPoints(TestKmeansClustering.reference);
+ List<Vector> points = TestKmeansClustering.getPoints(TestKmeansClustering.reference);
for (int k = 0; k < points.size(); k++) {
System.out.println("test k= " + k);
-
+
List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
// pick k initial cluster centers at random
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i));
- SoftCluster cluster = new SoftCluster(vec);
+ SoftCluster cluster = new SoftCluster(vec, i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter(), 1);
-
+ //cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
- Map<String,String> pointClusterInfo = new HashMap<String,String>();
+ Map<Integer, List<Vector>> pointClusterInfo = new HashMap<Integer, List<Vector>>();
// run reference FuzzyKmeans algorithm
- List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points, clusterList,
- new EuclideanDistanceMeasure(), 0.001, 2, 2);
- computeCluster(points, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(
- new EuclideanDistanceMeasure(), 0.001, 2), pointClusterInfo);
-
- // iterate for each point
- for (String value : pointClusterInfo.values()) {
- String clusterInfoStr = value.substring(1, value.length() - 1);
- String[] clusterInfoList = clusterInfoStr.split(" ");
- assertEquals("Number of clusters", k + 1, clusterInfoList.length);
- double prob = 0.0;
- for (String clusterInfo : clusterInfoList) {
- String[] clusterProb = clusterInfo.split(":");
-
- double clusterProbVal = Double.parseDouble(clusterProb[1]);
- prob += clusterProbVal;
- }
- prob = round(prob, 1);
- assertEquals("Sum of cluster Membership problability should be equal to=", 1.0, prob);
+ List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points, clusterList, new EuclideanDistanceMeasure(),
+ 0.001, 2, 2);
+ computeCluster(points, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(new EuclideanDistanceMeasure(), 0.001, 2),
+ pointClusterInfo);
+
+ // iterate for each cluster
+ int size = 0;
+ for (int cId : pointClusterInfo.keySet()) {
+ List<Vector> pts = pointClusterInfo.get(cId);
+ size += pts.size();
}
+ assertEquals("total size", size, points.size());
}
}
-
+
public void testFuzzyKMeansMRJob() throws Exception {
List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
File testData = new File("testdata");
@@ -158,7 +169,7 @@ public class TestFuzzyKmeansClustering e
}
Configuration conf = new Configuration();
ClusteringTestUtils.writePointsToFile(points, "testdata/points/file1", fs, conf);
-
+
for (int k = 0; k < points.size(); k++) {
System.out.println("testKFuzzyKMeansMRJob k= " + k);
// pick k initial cluster centers at random
@@ -168,21 +179,21 @@ public class TestFuzzyKmeansClustering e
if (fs.exists(path)) {
fs.delete(path, true);
}
-
+
testData = new File("testdata/clusters");
if (!testData.exists()) {
testData.mkdir();
}
-
+
/*
* BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( new
* FileOutputStream("testdata/clusters/part-00000"), Charset .forName("UTF-8")));
*/
- SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
- new Path("testdata/clusters/part-00000"), Text.class, SoftCluster.class);
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path("testdata/clusters/part-00000"), Text.class,
+ SoftCluster.class);
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
-
+
SoftCluster cluster = new SoftCluster(vec);
// add the center so the centroid will be correct upon output
cluster.addPoint(cluster.getCenter(), 1);
@@ -190,10 +201,10 @@ public class TestFuzzyKmeansClustering e
* writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
*/
writer.append(new Text(cluster.getIdentifier()), cluster);
-
+
}
writer.close();
-
+
Path outPath = new Path("output");
fs = FileSystem.get(outPath.toUri(), conf);
if (fs.exists(outPath)) {
@@ -201,9 +212,9 @@ public class TestFuzzyKmeansClustering e
}
fs.mkdirs(outPath);
// now run the Job
- FuzzyKMeansDriver.runJob("testdata/points", "testdata/clusters", "output",
- EuclideanDistanceMeasure.class.getName(), 0.001, 2, 1, k + 1, 2);
-
+ FuzzyKMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class.getName(), 0.001,
+ 2, 1, k + 1, 2);
+
// now compare the expected clusters with actual
File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
@@ -214,199 +225,196 @@ public class TestFuzzyKmeansClustering e
VectorWritable out = new VectorWritable();
while (reader.next(key, out)) {
// make sure we can read all the clusters
- }
+ }
reader.close();
-
+
}
-
+
}
-
+
public void testFuzzyKMeansMapper() throws Exception {
List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-
+
for (int k = 0; k < points.size(); k++) {
System.out.println("testKFuzzyKMeansMRJob k= " + k);
// pick k initial cluster centers at random
List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-
+
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
-
+
SoftCluster cluster = new SoftCluster(vec, i);
cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
-
+
// run mapper
FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
mapper.config(clusterList);
-
+
JobConf conf = new JobConf();
- conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
mapper.configure(conf);
-
- DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+ DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapCollector, null);
}
-
+
// now verify mapper output
assertEquals("Mapper Keys", k + 1, mapCollector.getData().size());
-
- Map<Vector,Double> pointTotalProbMap = new HashMap<Vector,Double>();
-
+
+ Map<Vector, Double> pointTotalProbMap = new HashMap<Vector, Double>();
+
for (Text key : mapCollector.getKeys()) {
// SoftCluster cluster = SoftCluster.decodeCluster(key);
List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
-
+
for (FuzzyKMeansInfo value : values) {
-
+
Double val = pointTotalProbMap.get(value.getVector());
double probVal = 0.0;
if (val != null) {
probVal = val;
}
-
+
pointTotalProbMap.put(value.getVector(), probVal + value.getProbability());
}
}
-
- for (Map.Entry<Vector,Double> entry : pointTotalProbMap.entrySet()) {
+
+ for (Map.Entry<Vector, Double> entry : pointTotalProbMap.entrySet()) {
Vector key = entry.getKey();
double value = round(entry.getValue(), 1);
-
+
assertEquals("total Prob for Point:" + key, 1.0, value);
}
}
}
-
+
public void testFuzzyKMeansCombiner() throws Exception {
List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-
+
for (int k = 0; k < points.size(); k++) {
System.out.println("testKFuzzyKMeansMRJob k= " + k);
// pick k initial cluster centers at random
List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-
+
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
-
+
SoftCluster cluster = new SoftCluster(vec, i);
cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
-
+
// run mapper
FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
mapper.config(clusterList);
-
+
JobConf conf = new JobConf();
- conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
mapper.configure(conf);
-
- DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+ DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapCollector, null);
}
-
+
// run combiner
- DummyOutputCollector<Text,FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+ DummyOutputCollector<Text, FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
combiner.configure(conf);
-
+
for (Text key : mapCollector.getKeys()) {
-
+
List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
combiner.reduce(new Text(key), values.iterator(), combinerCollector, null);
}
-
+
// now verify the combiner output
assertEquals("Combiner Output", k + 1, combinerCollector.getData().size());
-
+
for (Text key : combinerCollector.getKeys()) {
List<FuzzyKMeansInfo> values = combinerCollector.getValue(key);
assertEquals("too many values", 1, values.size());
}
}
}
-
+
public void testFuzzyKMeansReducer() throws Exception {
List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-
+
for (int k = 0; k < points.size(); k++) {
System.out.println("testKFuzzyKMeansMRJob k= " + k);
// pick k initial cluster centers at random
List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-
+
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
-
+
SoftCluster cluster = new SoftCluster(vec, i);
// cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
-
+
// run mapper
FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
mapper.config(clusterList);
-
+
JobConf conf = new JobConf();
- conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
mapper.configure(conf);
-
- DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+ DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapCollector, null);
}
-
+
// run combiner
- DummyOutputCollector<Text,FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+ DummyOutputCollector<Text, FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
combiner.configure(conf);
-
+
for (Text key : mapCollector.getKeys()) {
List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
combiner.reduce(new Text(key), values.iterator(), combinerCollector, null);
}
-
+
// run reducer
- DummyOutputCollector<Text,SoftCluster> reducerCollector = new DummyOutputCollector<Text,SoftCluster>();
+ DummyOutputCollector<Text, SoftCluster> reducerCollector = new DummyOutputCollector<Text, SoftCluster>();
FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
reducer.config(clusterList);
reducer.configure(conf);
-
+
for (Text key : combinerCollector.getKeys()) {
List<FuzzyKMeansInfo> values = combinerCollector.getValue(key);
reducer.reduce(new Text(key), values.iterator(), reducerCollector, new DummyReporter());
}
-
+
// now verify the reducer output
assertEquals("Reducer Output", k + 1, combinerCollector.getData().size());
-
+
// compute the reference result after one iteration and compare
List<SoftCluster> reference = new ArrayList<SoftCluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
reference.add(new SoftCluster(vec, i));
}
- List<NamedVector> pointsVectors = new ArrayList<NamedVector>();
+ List<Vector> pointsVectors = new ArrayList<Vector>();
for (VectorWritable point : points) {
- pointsVectors.add((NamedVector) point.get());
+ pointsVectors.add((Vector) point.get());
}
-
+
DistanceMeasure measure = new EuclideanDistanceMeasure();
FuzzyKMeansClusterer clusterer = new FuzzyKMeansClusterer(measure, 0.001, 2);
FuzzyKMeansClusterer.runFuzzyKMeansIteration(pointsVectors, reference, clusterer);
-
+
for (SoftCluster key : reference) {
String clusterId = key.getIdentifier();
List<SoftCluster> values = reducerCollector.getValue(new Text(clusterId));
@@ -414,39 +422,38 @@ public class TestFuzzyKmeansClustering e
System.out.println("ref= " + key.toString() + " cluster= " + cluster.toString());
cluster.recomputeCenter();
assertEquals("key center: " + key.getCenter().asFormatString() + " does not equal cluster: "
- + cluster.getCenter().asFormatString(), key.getCenter(), cluster.getCenter());
+ + cluster.getCenter().asFormatString(), key.getCenter(), cluster.getCenter());
}
}
}
-
+
public void testFuzzyKMeansClusterMapper() throws Exception {
List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-
+
for (int k = 0; k < points.size(); k++) {
System.out.println("testKFuzzyKMeansMRJob k= " + k);
// pick k initial cluster centers at random
List<SoftCluster> clusterList = new ArrayList<SoftCluster>();
-
+
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
-
+
SoftCluster cluster = new SoftCluster(vec, i);
cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
-
+
// run mapper
FuzzyKMeansMapper mapper = new FuzzyKMeansMapper();
mapper.config(clusterList);
-
+
JobConf conf = new JobConf();
- conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(FuzzyKMeansConfigKeys.M_KEY, "2");
mapper.configure(conf);
-
- DummyOutputCollector<Text,FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+
+ DummyOutputCollector<Text, FuzzyKMeansInfo> mapCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapCollector, null);
}
@@ -454,30 +461,30 @@ public class TestFuzzyKmeansClustering e
softCluster.recomputeCenter();
}
// run combiner
- DummyOutputCollector<Text,FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text,FuzzyKMeansInfo>();
+ DummyOutputCollector<Text, FuzzyKMeansInfo> combinerCollector = new DummyOutputCollector<Text, FuzzyKMeansInfo>();
FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
combiner.configure(conf);
-
+
for (Text key : mapCollector.getKeys()) {
-
+
List<FuzzyKMeansInfo> values = mapCollector.getValue(key);
combiner.reduce(new Text(key), values.iterator(), combinerCollector, null);
}
-
+
// run reducer
- DummyOutputCollector<Text,SoftCluster> reducerCollector = new DummyOutputCollector<Text,SoftCluster>();
+ DummyOutputCollector<Text, SoftCluster> reducerCollector = new DummyOutputCollector<Text, SoftCluster>();
FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
reducer.config(clusterList);
reducer.configure(conf);
-
+
for (Text key : combinerCollector.getKeys()) {
List<FuzzyKMeansInfo> values = combinerCollector.getValue(key);
reducer.reduce(new Text(key), values.iterator(), reducerCollector, null);
}
-
+
// run clusterMapper
List<SoftCluster> reducerCluster = new ArrayList<SoftCluster>();
-
+
for (Text key : reducerCollector.getKeys()) {
List<SoftCluster> values = reducerCollector.getValue(key);
reducerCluster.add(values.get(0));
@@ -485,17 +492,17 @@ public class TestFuzzyKmeansClustering e
for (SoftCluster softCluster : reducerCluster) {
softCluster.recomputeCenter();
}
-
- DummyOutputCollector<IntWritable,VectorWritable> clusterMapperCollector = new DummyOutputCollector<IntWritable,VectorWritable>();
-
+
+ DummyOutputCollector<IntWritable, VectorWritable> clusterMapperCollector = new DummyOutputCollector<IntWritable, VectorWritable>();
+
FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
clusterMapper.config(reducerCluster);
clusterMapper.configure(conf);
-
+
for (VectorWritable point : points) {
clusterMapper.map(new Text(), point, clusterMapperCollector, null);
}
-
+
// now run for one iteration of referencefuzzykmeans and compare the
// results
// compute the reference result after one iteration and compare
@@ -504,36 +511,32 @@ public class TestFuzzyKmeansClustering e
Vector vec = tweakValue(points.get(i).get());
reference.add(new SoftCluster(vec, i));
}
- Map<String,String> pointClusterInfo = new HashMap<String,String>();
- List<NamedVector> pointsVectors = new ArrayList<NamedVector>();
+ Map<Integer, List<Vector>> refClusters = new HashMap<Integer, List<Vector>>();
+ List<Vector> pointsVectors = new ArrayList<Vector>();
for (VectorWritable point : points) {
- pointsVectors.add((NamedVector) point.get());
+ pointsVectors.add((Vector) point.get());
}
-
+
List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(pointsVectors, reference,
- new EuclideanDistanceMeasure(), 0.001, 2, 1);
- computeCluster(pointsVectors, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(
- new EuclideanDistanceMeasure(), 0.001, 2), pointClusterInfo);
-
- // Now compare the clustermapper results with reducer
- for (IntWritable key : clusterMapperCollector.getKeys()) {
- List<VectorWritable> value = clusterMapperCollector.getValue(key);
-
- String refValue = pointClusterInfo.get(key.toString());
- String clusterInfoStr = refValue.substring(1, refValue.length() - 1);
- String[] refClusterInfoList = clusterInfoStr.split(" ");
- assertEquals("Number of clusters", k + 1, refClusterInfoList.length);
- Map<String,Double> refClusterInfoMap = new HashMap<String,Double>();
- for (String clusterInfo : refClusterInfoList) {
- String[] clusterProb = clusterInfo.split(":");
- double clusterProbVal = Double.parseDouble(clusterProb[1]);
- refClusterInfoMap.put(clusterProb[0], clusterProbVal);
- }
-
- VectorWritable kMeansOutput = value.get(0);
- // TODO: fail("test this output");
+ new EuclideanDistanceMeasure(), 0.001, 2, 1);
+
+ computeCluster(pointsVectors, clusters.get(clusters.size() - 1), new FuzzyKMeansClusterer(new EuclideanDistanceMeasure(),
+ 0.001, 2), refClusters);
+
+ // Now compare the clustermapper results with the reference implementation
+ assertEquals("mapper and reference sizes", refClusters.size(), clusterMapperCollector.getKeys().size());
+ for (int pcId : refClusters.keySet()) {
+ assertEquals("cluster " + pcId + " sizes", refClusters.get(pcId).size(), clusterMapperCollector.getValue(
+ new IntWritable(pcId)).size());
+ }
+ // make sure all points are allocated to a cluster
+ int size = 0;
+ for (int cId : refClusters.keySet()) {
+ List<Vector> pts = refClusters.get(cId);
+ size += pts.size();
}
+ assertEquals("total size", size, points.size());
}
}
-
+
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Wed Apr 28 17:37:12 2010
@@ -40,23 +40,21 @@ import org.apache.mahout.common.distance
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
public class TestKmeansClustering extends MahoutTestCase {
-
- public static final double[][] reference = { {1, 1}, {2, 1}, {1, 2}, {2, 2}, {3, 3}, {4, 4}, {5, 4},
- {4, 5}, {5, 5}};
-
- private static final int[][] expectedNumPoints = { {9}, {4, 5}, {4, 4, 1}, {1, 2, 1, 5}, {1, 1, 1, 2, 4},
- {1, 1, 1, 1, 1, 4}, {1, 1, 1, 1, 1, 2, 2},
- {1, 1, 1, 1, 1, 1, 2, 1}, {1, 1, 1, 1, 1, 1, 1, 1, 1}};
-
+
+ public static final double[][] reference = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
+ { 5, 5 } };
+
+ private static final int[][] expectedNumPoints = { { 9 }, { 4, 5 }, { 4, 4, 1 }, { 1, 2, 1, 5 }, { 1, 1, 1, 2, 4 },
+ { 1, 1, 1, 1, 1, 4 }, { 1, 1, 1, 1, 1, 2, 2 }, { 1, 1, 1, 1, 1, 1, 2, 1 }, { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };
+
private FileSystem fs;
-
+
private static void rmr(String path) {
File f = new File(path);
if (f.exists()) {
@@ -69,7 +67,7 @@ public class TestKmeansClustering extend
f.delete();
}
}
-
+
@Override
protected void setUp() throws Exception {
super.setUp();
@@ -78,32 +76,32 @@ public class TestKmeansClustering extend
Configuration conf = new Configuration();
fs = FileSystem.get(conf);
}
-
+
public static List<VectorWritable> getPointsWritable(double[][] raw) {
List<VectorWritable> points = new ArrayList<VectorWritable>();
for (int i = 0; i < raw.length; i++) {
double[] fr = raw[i];
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
- points.add(new VectorWritable(new NamedVector(vec, String.valueOf(i))));
+ points.add(new VectorWritable(vec));
}
return points;
}
-
- public static List<NamedVector> getPoints(double[][] raw) {
- List<NamedVector> points = new ArrayList<NamedVector>();
+
+ public static List<Vector> getPoints(double[][] raw) {
+ List<Vector> points = new ArrayList<Vector>();
for (int i = 0; i < raw.length; i++) {
double[] fr = raw[i];
- NamedVector vec = new NamedVector(new SequentialAccessSparseVector(fr.length), String.valueOf(i));
+ Vector vec = new SequentialAccessSparseVector(fr.length);
vec.assign(fr);
points.add(vec);
}
return points;
}
-
+
/** Story: Test the reference implementation */
public void testReferenceImplementation() throws Exception {
- List<NamedVector> points = getPoints(reference);
+ List<Vector> points = getPoints(reference);
DistanceMeasure measure = new EuclideanDistanceMeasure();
// try all possible values of k
for (int k = 0; k < points.size(); k++) {
@@ -116,8 +114,7 @@ public class TestKmeansClustering extend
}
// iterate clusters until they converge
int maxIter = 10;
- List<List<Cluster>> clustersList = KMeansClusterer.clusterPoints(points, clusters, measure, maxIter,
- 0.001);
+ List<List<Cluster>> clustersList = KMeansClusterer.clusterPoints(points, clusters, measure, maxIter, 0.001);
clusters = clustersList.get(clustersList.size() - 1);
for (int c = 0; c < clusters.size(); c++) {
Cluster cluster = clusters.get(c);
@@ -126,9 +123,9 @@ public class TestKmeansClustering extend
}
}
}
-
+
public void testStd() {
- List<NamedVector> points = getPoints(reference);
+ List<Vector> points = getPoints(reference);
Cluster c = new Cluster(points.get(0));
for (Vector p : points) {
c.addPoint(p);
@@ -138,30 +135,29 @@ public class TestKmeansClustering extend
}
}
- private static Map<String,Cluster> loadClusterMap(List<Cluster> clusters) {
- Map<String,Cluster> clusterMap = new HashMap<String,Cluster>();
-
+ private static Map<String, Cluster> loadClusterMap(List<Cluster> clusters) {
+ Map<String, Cluster> clusterMap = new HashMap<String, Cluster>();
+
for (Cluster cluster : clusters) {
clusterMap.put(cluster.getIdentifier(), cluster);
}
return clusterMap;
}
-
+
/** Story: test that the mapper will map input points to the nearest cluster */
public void testKMeansMapper() throws Exception {
KMeansMapper mapper = new KMeansMapper();
JobConf conf = new JobConf();
- conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, "");
mapper.configure(conf);
List<VectorWritable> points = getPointsWritable(reference);
for (int k = 0; k < points.size(); k++) {
// pick k initial cluster centers at random
- DummyOutputCollector<Text,KMeansInfo> collector = new DummyOutputCollector<Text,KMeansInfo>();
+ DummyOutputCollector<Text, KMeansInfo> collector = new DummyOutputCollector<Text, KMeansInfo>();
List<Cluster> clusters = new ArrayList<Cluster>();
-
+
for (int i = 0; i < k + 1; i++) {
Cluster cluster = new Cluster(points.get(i).get(), i);
// add the center so the centroid will be correct upon output
@@ -169,7 +165,7 @@ public class TestKmeansClustering extend
clusters.add(cluster);
}
mapper.config(clusters);
-
+
// map the data
for (VectorWritable point : points) {
mapper.map(new Text(), point, collector, null);
@@ -177,21 +173,20 @@ public class TestKmeansClustering extend
assertEquals("Number of map results", k + 1, collector.getData().size());
// now verify that all points are correctly allocated
EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
- Map<String,Cluster> clusterMap = loadClusterMap(clusters);
+ Map<String, Cluster> clusterMap = loadClusterMap(clusters);
for (Text key : collector.getKeys()) {
Cluster cluster = clusterMap.get(key.toString());
List<KMeansInfo> values = collector.getValue(key);
for (KMeansInfo value : values) {
double distance = euclideanDistanceMeasure.distance(cluster.getCenter(), value.getPointTotal());
for (Cluster c : clusters) {
- assertTrue("distance error", distance <= euclideanDistanceMeasure.distance(value.getPointTotal(),
- c.getCenter()));
+ assertTrue("distance error", distance <= euclideanDistanceMeasure.distance(value.getPointTotal(), c.getCenter()));
}
}
}
}
}
-
+
/**
* Story: test that the combiner will produce partial cluster totals for all of the clusters and points that
* it sees
@@ -199,19 +194,18 @@ public class TestKmeansClustering extend
public void testKMeansCombiner() throws Exception {
KMeansMapper mapper = new KMeansMapper();
JobConf conf = new JobConf();
- conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, "");
mapper.configure(conf);
List<VectorWritable> points = getPointsWritable(reference);
for (int k = 0; k < points.size(); k++) {
// pick k initial cluster centers at random
- DummyOutputCollector<Text,KMeansInfo> collector = new DummyOutputCollector<Text,KMeansInfo>();
+ DummyOutputCollector<Text, KMeansInfo> collector = new DummyOutputCollector<Text, KMeansInfo>();
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
-
+
Cluster cluster = new Cluster(vec, i);
// add the center so the centroid will be correct upon output
cluster.addPoint(cluster.getCenter());
@@ -224,11 +218,11 @@ public class TestKmeansClustering extend
}
// now combine the data
KMeansCombiner combiner = new KMeansCombiner();
- DummyOutputCollector<Text,KMeansInfo> collector2 = new DummyOutputCollector<Text,KMeansInfo>();
+ DummyOutputCollector<Text, KMeansInfo> collector2 = new DummyOutputCollector<Text, KMeansInfo>();
for (Text key : collector.getKeys()) {
combiner.reduce(new Text(key), collector.getValue(key).iterator(), collector2, null);
}
-
+
assertEquals("Number of map results", k + 1, collector2.getData().size());
// now verify that all points are accounted for
int count = 0;
@@ -238,7 +232,7 @@ public class TestKmeansClustering extend
assertEquals("too many values", 1, values.size());
// String value = values.get(0).toString();
KMeansInfo info = values.get(0);
-
+
count += info.getPoints();
total = total.plus(info.getPointTotal());
}
@@ -247,7 +241,7 @@ public class TestKmeansClustering extend
assertEquals("point total[1]", 27, (int) total.get(1));
}
}
-
+
/**
* Story: test that the reducer will sum the partial cluster totals for all of the clusters and points that
* it sees
@@ -256,8 +250,7 @@ public class TestKmeansClustering extend
KMeansMapper mapper = new KMeansMapper();
EuclideanDistanceMeasure euclideanDistanceMeasure = new EuclideanDistanceMeasure();
JobConf conf = new JobConf();
- conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
+ conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.001");
conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, "");
mapper.configure(conf);
@@ -265,7 +258,7 @@ public class TestKmeansClustering extend
for (int k = 0; k < points.size(); k++) {
System.out.println("K = " + k);
// pick k initial cluster centers at random
- DummyOutputCollector<Text,KMeansInfo> collector = new DummyOutputCollector<Text,KMeansInfo>();
+ DummyOutputCollector<Text, KMeansInfo> collector = new DummyOutputCollector<Text, KMeansInfo>();
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
@@ -281,40 +274,39 @@ public class TestKmeansClustering extend
}
// now combine the data
KMeansCombiner combiner = new KMeansCombiner();
- DummyOutputCollector<Text,KMeansInfo> collector2 = new DummyOutputCollector<Text,KMeansInfo>();
+ DummyOutputCollector<Text, KMeansInfo> collector2 = new DummyOutputCollector<Text, KMeansInfo>();
for (Text key : collector.getKeys()) {
combiner.reduce(new Text(key), collector.getValue(key).iterator(), collector2, null);
}
-
+
// now reduce the data
KMeansReducer reducer = new KMeansReducer();
reducer.configure(conf);
reducer.config(clusters);
- DummyOutputCollector<Text,Cluster> collector3 = new DummyOutputCollector<Text,Cluster>();
+ DummyOutputCollector<Text, Cluster> collector3 = new DummyOutputCollector<Text, Cluster>();
for (Text key : collector2.getKeys()) {
reducer.reduce(new Text(key), collector2.getValue(key).iterator(), collector3, new DummyReporter());
}
-
+
assertEquals("Number of map results", k + 1, collector3.getData().size());
-
+
// compute the reference result after one iteration and compare
List<Cluster> reference = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
reference.add(new Cluster(vec, i));
}
- List<NamedVector> pointsVectors = new ArrayList<NamedVector>();
+ List<Vector> pointsVectors = new ArrayList<Vector>();
for (VectorWritable point : points) {
- pointsVectors.add((NamedVector) point.get());
+ pointsVectors.add((Vector) point.get());
}
- boolean converged = KMeansClusterer.runKMeansIteration(pointsVectors, reference,
- euclideanDistanceMeasure, 0.001);
+ boolean converged = KMeansClusterer.runKMeansIteration(pointsVectors, reference, euclideanDistanceMeasure, 0.001);
if (k == 8) {
assertTrue("not converged? " + k, converged);
} else {
assertFalse("converged? " + k, converged);
}
-
+
// now verify that all clusters have correct centers
converged = true;
for (int i = 0; i < reference.size(); i++) {
@@ -335,7 +327,7 @@ public class TestKmeansClustering extend
}
}
}
-
+
/** Story: User wishes to run kmeans job on reference data */
public void testKMeansMRJob() throws Exception {
List<VectorWritable> points = getPointsWritable(reference);
@@ -347,7 +339,7 @@ public class TestKmeansClustering extend
if (!testData.exists()) {
testData.mkdir();
}
-
+
Configuration conf = new Configuration();
ClusteringTestUtils.writePointsToFile(points, "testdata/points/file1", fs, conf);
ClusteringTestUtils.writePointsToFile(points, "testdata/points/file2", fs, conf);
@@ -358,10 +350,10 @@ public class TestKmeansClustering extend
Path path = new Path("testdata/clusters/part-00000");
FileSystem fs = FileSystem.get(path.toUri(), job);
SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, Text.class, Cluster.class);
-
+
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
-
+
Cluster cluster = new Cluster(vec, i);
// add the center so the centroid will be correct upon output
cluster.addPoint(cluster.getCenter());
@@ -370,24 +362,24 @@ public class TestKmeansClustering extend
writer.close();
// now run the Job
HadoopUtil.overwriteOutput("output");
- KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class
- .getName(), 0.001, 10, k + 1);
+ KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10,
+ k + 1);
// now compare the expected clusters with actual
File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
// assertEquals("output dir files?", 4, outFiles.length);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
int[] expect = expectedNumPoints[k];
- DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
+ DummyOutputCollector<IntWritable, VectorWritable> collector = new DummyOutputCollector<IntWritable, VectorWritable>();
// The key is the clusterId
IntWritable clusterId = new IntWritable(0);
// The value is the vector
VectorWritable value = new VectorWritable();
while (reader.next(clusterId, value)) {
- collector.collect(clusterId, value);
+ collector.collect(clusterId, value);
clusterId = new IntWritable(0);
value = new VectorWritable();
-
+
}
reader.close();
if (k == 2)
@@ -399,7 +391,7 @@ public class TestKmeansClustering extend
}
}
}
-
+
/** Story: User wants to use canopy clustering to input the initial clusters for kmeans job. */
public void testKMeansWithCanopyClusterInput() throws Exception {
List<VectorWritable> points = getPointsWritable(reference);
@@ -414,23 +406,21 @@ public class TestKmeansClustering extend
Configuration conf = new Configuration();
ClusteringTestUtils.writePointsToFile(points, "testdata/points/file1", fs, conf);
ClusteringTestUtils.writePointsToFile(points, "testdata/points/file2", fs, conf);
-
+
// now run the Canopy job
- CanopyDriver.runJob("testdata/points", "testdata/canopies", ManhattanDistanceMeasure.class.getName(),
- 3.1, 2.1);
-
+ CanopyDriver.runJob("testdata/points", "testdata/canopies", ManhattanDistanceMeasure.class.getName(), 3.1, 2.1);
+
// now run the KMeans job
- KMeansDriver.runJob("testdata/points", "testdata/canopies", "output", EuclideanDistanceMeasure.class
- .getName(), 0.001, 10, 1);
-
+ KMeansDriver.runJob("testdata/points", "testdata/canopies", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+
// now compare the expected clusters with actual
File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
String[] outFiles = outDir.list();
assertEquals("output dir files?", 4, outFiles.length);
- DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
+ DummyOutputCollector<IntWritable, VectorWritable> collector = new DummyOutputCollector<IntWritable, VectorWritable>();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
-
+
// The key is the clusterId
IntWritable clusterId = new IntWritable(0);
// The value is the vector
@@ -439,10 +429,10 @@ public class TestKmeansClustering extend
collector.collect(clusterId, value);
clusterId = new IntWritable(0);
value = new VectorWritable();
-
+
}
reader.close();
-
+
assertEquals("num points[0]", 4, collector.getValue(new IntWritable(0)).size());
assertEquals("num points[1]", 5, collector.getValue(new IntWritable(1)).size());
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/dirichlet/DisplayDirichlet.java Wed Apr 28 17:37:12 2010
@@ -215,10 +215,8 @@ public class DisplayDirichlet extends Fr
sampleParams.add(new DenseVector(params));
log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] {num, mx, my, sd});
for (int i = 0; i < num; i++) {
- sampleData.add(new VectorWritable(
- new NamedVector(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
- UncommonDistributions.rNorm(my, sd)}),
- String.valueOf(i))));
+ sampleData.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd)})));
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java Wed Apr 28 17:37:12 2010
@@ -62,9 +62,9 @@ class DisplayFuzzyKMeans extends Display
public static void main(String[] args) {
RandomUtils.useTestSeed();
DisplayDirichlet.generateSamples();
- List<NamedVector> points = new ArrayList<NamedVector>();
+ List<Vector> points = new ArrayList<Vector>();
for (VectorWritable sample : sampleData) {
- points.add((NamedVector) sample.get());
+ points.add((Vector) sample.get());
}
DistanceMeasure measure = new ManhattanDistanceMeasure();
List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java Wed Apr 28 17:37:12 2010
@@ -33,14 +33,14 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
class DisplayKMeans extends DisplayDirichlet {
-
+
private static List<List<Cluster>> clusters;
-
+
DisplayKMeans() {
initialize();
this.setTitle("K-Means Clusters (> 5% of population)");
}
-
+
@Override
public void paint(Graphics g) {
super.plotSampleData(g);
@@ -59,13 +59,13 @@ class DisplayKMeans extends DisplayDiric
}
}
}
-
+
public static void main(String[] args) {
RandomUtils.useTestSeed();
DisplayDirichlet.generateSamples();
- List<NamedVector> points = new ArrayList<NamedVector>();
+ List<Vector> points = new ArrayList<Vector>();
for (VectorWritable sample : sampleData) {
- points.add((NamedVector) sample.get());
+ points.add(sample.get());
}
DistanceMeasure measure = new ManhattanDistanceMeasure();
List<Cluster> initialClusters = new ArrayList<Cluster>();
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java Wed Apr 28 17:37:12 2010
@@ -34,7 +34,6 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
@@ -44,8 +43,8 @@ import org.apache.hadoop.mapred.Sequence
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedPointWritable;
import org.apache.mahout.clustering.dirichlet.DirichletCluster;
-import org.apache.mahout.clustering.dirichlet.DirichletMapper;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -142,6 +141,13 @@ public class CDbwDriver {
// now point the input to the old output directory
stateIn = stateOut;
}
+
+ Configurable client = new JobClient();
+ JobConf conf = new JobConf(CDbwDriver.class);
+ conf.set(STATE_IN_KEY, stateIn);
+ conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
+ CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
+ System.out.println("CDbw = " + evaluator.CDbw());
}
private static void writeInitialState(String output, String clustersIn) throws ClassNotFoundException, InstantiationException,
@@ -192,7 +198,7 @@ public class CDbwDriver {
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setMapOutputKeyClass(IntWritable.class);
- conf.setMapOutputValueClass(CDbwDistantPointWritable.class);
+ conf.setMapOutputValueClass(WeightedPointWritable.class);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(stateOut);
@@ -213,36 +219,4 @@ public class CDbwDriver {
log.warn(e.toString(), e);
}
}
-
- /**
- * Run the job using supplied arguments
- *
- * @param input
- * the directory pathname for input points
- * @param stateIn
- * the directory pathname for input state
- * @param output
- * the directory pathname for output points
- */
- public static void runClustering(String input, String stateIn, String output) {
- Configurable client = new JobClient();
- JobConf conf = new JobConf(CDbwDriver.class);
-
- conf.setOutputKeyClass(Text.class);
- conf.setOutputValueClass(Text.class);
-
- FileInputFormat.setInputPaths(conf, new Path(input));
- Path outPath = new Path(output);
- FileOutputFormat.setOutputPath(conf, outPath);
-
- conf.setMapperClass(DirichletMapper.class);
- conf.setNumReduceTasks(0);
-
- client.setConf(conf);
- try {
- JobClient.runJob(conf);
- } catch (IOException e) {
- log.warn(e.toString(), e);
- }
- }
}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=939019&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java Wed Apr 28 17:37:12 2010
@@ -0,0 +1,304 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.cdbw;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+public class CDbwEvaluator {
+
+ /** Representative points of each cluster, keyed by clusterId. */
+ Map<Integer, List<VectorWritable>> representativePoints;
+
+ /** Per-cluster standard deviation keyed by clusterId; filled in by setStDev() during construction. */
+ Map<Integer, Double> stDevs = new HashMap<Integer, Double>();
+
+ /** The clusters being evaluated, keyed by clusterId. */
+ Map<Integer, Cluster> clusters;
+
+ /** Distance measure used for every distance computation in this evaluator. */
+ DistanceMeasure measure;
+
+  /**
+   * Builds an evaluator directly from in-memory data. For testing only.
+   *
+   * @param representativePoints
+   *          a {@code Map<Integer,List<VectorWritable>>} of representative points keyed by clusterId
+   * @param clusters
+   *          a {@code Map<Integer,Cluster>} of the clusters keyed by clusterId
+   * @param measure
+   *          an appropriate DistanceMeasure
+   */
+  public CDbwEvaluator(Map<Integer, List<VectorWritable>> representativePoints, Map<Integer, Cluster> clusters,
+      DistanceMeasure measure) {
+    this.measure = measure;
+    this.clusters = clusters;
+    this.representativePoints = representativePoints;
+    // Pre-compute and cache the standard deviation of every cluster.
+    for (Integer clusterId : this.representativePoints.keySet()) {
+      setStDev(clusterId);
+    }
+  }
+
+  /**
+   * Initialize a new instance from job information. The distance measure class named by
+   * CDbwDriver.DISTANCE_MEASURE_KEY is loaded through the context class loader, the representative
+   * points are obtained from CDbwMapper.getRepresentativePoints(job) and the clusters are read
+   * from clustersIn.
+   *
+   * @param job
+   *          a JobConf with appropriate parameters
+   * @param clustersIn
+   *          a String path to the input clusters directory
+   *
+   * @throws SecurityException
+   *           propagated from the reflective loading of points or clusters
+   * @throws IllegalArgumentException
+   *           propagated from the reflective loading of points or clusters
+   * @throws NoSuchMethodException
+   *           propagated from the reflective loading of points or clusters
+   * @throws InvocationTargetException
+   *           propagated from the reflective loading of points or clusters
+   * @throws ClassNotFoundException
+   *           if the configured distance measure class cannot be loaded
+   * @throws InstantiationException
+   *           if the distance measure, or a key/value read from a cluster file, cannot be instantiated
+   * @throws IllegalAccessException
+   *           if a no-argument constructor needed for instantiation is not accessible
+   * @throws IOException
+   *           if the cluster files cannot be read
+   */
+  public CDbwEvaluator(JobConf job, String clustersIn) throws SecurityException, IllegalArgumentException, NoSuchMethodException,
+      InvocationTargetException, ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
+    Class<?> measureClass = Thread.currentThread().getContextClassLoader().loadClass(job.get(CDbwDriver.DISTANCE_MEASURE_KEY));
+    this.measure = (DistanceMeasure) measureClass.newInstance();
+    this.representativePoints = CDbwMapper.getRepresentativePoints(job);
+    this.clusters = loadClusters(job, clustersIn);
+    // Pre-compute and cache the standard deviation of every cluster.
+    for (Integer clusterId : this.representativePoints.keySet()) {
+      setStDev(clusterId);
+    }
+  }
+
+  /**
+   * Computes the CDbw value as the product of the intra-cluster density and the separation,
+   * and prints it to standard output.
+   *
+   * @return the product intraClusterDensity() * separation()
+   */
+  public double CDbw() {
+    // Evaluate in this order so the diagnostic output of the two terms keeps its sequence.
+    final double density = intraClusterDensity();
+    final double sep = separation();
+    final double result = density * sep;
+    System.out.println("CDbw=" + result);
+    return result;
+  }
+
+  /**
+   * Load the clusters from their sequence files. Every file in the directory whose name does not
+   * start with "." is opened as a SequenceFile and each value it contains is stored as a Cluster.
+   *
+   * @param job
+   *          a JobConf used to resolve the FileSystem and to open the sequence files
+   * @param clustersIn
+   *          a String pathname to the directory containing input cluster files
+   * @return a {@code Map<Integer,Cluster>} of the clusters keyed by cluster id
+   *
+   * @throws ClassNotFoundException
+   *           declared for callers; kept for interface compatibility
+   * @throws InstantiationException
+   *           if a key or value instance cannot be created
+   * @throws IllegalAccessException
+   *           if the key or value class has no accessible no-argument constructor
+   * @throws IOException
+   *           if the directory cannot be listed or a cluster file cannot be read
+   * @throws SecurityException
+   *           declared for callers; kept for interface compatibility
+   * @throws NoSuchMethodException
+   *           declared for callers; kept for interface compatibility
+   * @throws InvocationTargetException
+   *           declared for callers; kept for interface compatibility
+   */
+  private HashMap<Integer, Cluster> loadClusters(JobConf job, String clustersIn) throws ClassNotFoundException,
+      InstantiationException, IllegalAccessException, IOException, SecurityException, NoSuchMethodException,
+      InvocationTargetException {
+    HashMap<Integer, Cluster> clusters = new HashMap<Integer, Cluster>();
+    // listFiles() returns null when the path is not a directory or cannot be read
+    File[] parts = new File(clustersIn).listFiles();
+    if (parts == null) {
+      throw new IOException("Unable to list cluster files in directory: " + clustersIn);
+    }
+    for (File part : parts) {
+      if (part.getName().startsWith(".")) {
+        continue; // skip dot-files
+      }
+      Path inPart = new Path(clustersIn + "/" + part.getName());
+      FileSystem fs = FileSystem.get(inPart.toUri(), job);
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPart, job);
+      try {
+        Writable key = (Writable) reader.getKeyClass().newInstance();
+        Writable value = (Writable) reader.getValueClass().newInstance();
+        while (reader.next(key, value)) {
+          Cluster cluster = (Cluster) value;
+          clusters.put(cluster.getId(), cluster);
+          // allocate a fresh value so the cluster just stored is not overwritten by the next read
+          value = (Writable) reader.getValueClass().newInstance();
+        }
+      } finally {
+        // always release the reader, even when reading or instantiation fails
+        reader.close();
+      }
+    }
+    return clusters;
+  }
+
+  /**
+   * Sums, over every ordered pair of distinct clusters (cI, cJ), the term
+   * minDistance * interDensity / (stDevI + stDevJ), where minDistance is the smallest distance
+   * between a representative point of cI and one of cJ, and interDensity is evaluated at the
+   * midpoint of that closest pair. A pair contributes nothing when the sum of its standard
+   * deviations is not positive, or when no closest pair of representative points exists
+   * (for example because one of the clusters has no representative points).
+   * The result is also printed to standard output.
+   *
+   * @return the summed inter-cluster density
+   */
+  double interClusterDensity() {
+    double sum = 0;
+    for (int cI : representativePoints.keySet()) {
+      for (int cJ : representativePoints.keySet()) {
+        if (cI == cJ) {
+          continue;
+        }
+        List<VectorWritable> repI = representativePoints.get(cI);
+        List<VectorWritable> repJ = representativePoints.get(cJ);
+        double minDistance = Double.MAX_VALUE;
+        Vector uIJ = null;
+        for (VectorWritable vwI : repI) {
+          Vector vI = vwI.get();
+          for (VectorWritable vwJ : repJ) {
+            Vector vJ = vwJ.get();
+            double distance = measure.distance(vI, vJ);
+            if (distance < minDistance) {
+              minDistance = distance;
+              // midpoint of the closest pair found so far
+              uIJ = vI.plus(vJ).divide(2);
+            }
+          }
+        }
+        if (uIJ == null) {
+          // no closest pair was found, so there is no midpoint to evaluate the density at
+          continue;
+        }
+        double stDevI = stDevs.get(cI);
+        double stDevJ = stDevs.get(cJ);
+        double interDensity = interDensity(uIJ, cI, cJ);
+        double stdSum = stDevI + stDevJ;
+        if (stdSum > 0) {
+          sum += minDistance * interDensity / stdSum;
+        }
+      }
+    }
+    System.out.println("interClusterDensity=" + sum);
+    return sum;
+  }
+
+  /**
+   * Computes the fraction of the representative points of clusters cI and cJ that lie within the
+   * average of the two clusters' standard deviations from the point uIJ.
+   *
+   * @param uIJ
+   *          the Vector around which the density is evaluated
+   * @param cI
+   *          the id of the first cluster
+   * @param cJ
+   *          the id of the second cluster
+   * @return the number of representative points within range divided by the total number of
+   *         representative points of both clusters
+   */
+  double interDensity(Vector uIJ, int cI, int cJ) {
+    List<VectorWritable> pointsI = representativePoints.get(cI);
+    List<VectorWritable> pointsJ = representativePoints.get(cJ);
+    double radius = (stDevs.get(cI) + stDevs.get(cJ)) / 2;
+    int withinRadius = 0;
+    for (VectorWritable candidate : pointsI) {
+      if (measure.distance(uIJ, candidate.get()) <= radius) {
+        withinRadius++;
+      }
+    }
+    for (VectorWritable candidate : pointsJ) {
+      if (measure.distance(uIJ, candidate.get()) <= radius) {
+        withinRadius++;
+      }
+    }
+    return (double) withinRadius / (pointsI.size() + pointsJ.size());
+  }
+
+  /**
+   * Computes the standard deviation of the representative points of a cluster and caches it in
+   * stDevs. Per dimension the value is sqrt(s0 * s2 - s1 * s1) / s0, where s0 is the number of
+   * points, s1 their element-wise sum and s2 the element-wise sum of their squares; the cached
+   * value is the mean of these per-dimension values. The result is printed to standard output.
+   *
+   * @param cI
+   *          the id of the cluster
+   * @throws IllegalArgumentException
+   *           if the cluster has no representative points
+   */
+  private void setStDev(int cI) {
+    List<VectorWritable> repPts = representativePoints.get(cI);
+    if (repPts == null || repPts.isEmpty()) {
+      // fail with a clear message instead of a NullPointerException further down
+      throw new IllegalArgumentException("No representative points for cluster " + cI);
+    }
+    int s0 = 0;
+    Vector s1 = null;
+    Vector s2 = null;
+    for (VectorWritable vw : repPts) {
+      s0++;
+      Vector v = vw.get();
+      if (s1 == null) {
+        s1 = v.clone();
+      } else {
+        s1 = s1.plus(v);
+      }
+      if (s2 == null) {
+        s2 = v.times(v);
+      } else {
+        s2 = s2.plus(v.times(v));
+      }
+    }
+    // NOTE(review): rounding could make s0*s2 - s1*s1 slightly negative, giving NaN - confirm whether clamping is needed
+    Vector std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
+    double d = std.zSum() / std.size();
+    System.out.println("stDev[" + cI + "]=" + d);
+    stDevs.put(cI, d);
+  }
+
+  /**
+   * Finds the smallest distance between any point of repI and any point of repJ.
+   *
+   * @param repI
+   *          the representative points of the first cluster
+   * @param repJ
+   *          the representative points of the second cluster
+   * @return the minimum pairwise distance, or Double.MAX_VALUE if no pair yields a smaller value
+   */
+  double minRpDistance(List<VectorWritable> repI, List<VectorWritable> repJ) {
+    double closest = Double.MAX_VALUE;
+    for (VectorWritable first : repI) {
+      for (VectorWritable second : repJ) {
+        double candidate = measure.distance(first.get(), second.get());
+        if (candidate < closest) {
+          closest = candidate;
+        }
+      }
+    }
+    return closest;
+  }
+
+  /**
+   * Computes the separation as the smallest distance between representative points of any two
+   * distinct clusters, divided by (1 + interClusterDensity()). The result is printed to
+   * standard output.
+   *
+   * @return the separation value
+   */
+  double separation() {
+    double closest = Double.MAX_VALUE;
+    for (int cI : representativePoints.keySet()) {
+      for (int cJ : representativePoints.keySet()) {
+        if (cI != cJ) {
+          // smallest distance between the representative points of this pair of clusters
+          double pairMin = minRpDistance(representativePoints.get(cI), representativePoints.get(cJ));
+          if (pairMin < closest) {
+            closest = pairMin;
+          }
+        }
+      }
+    }
+    double result = closest / (1 + interClusterDensity());
+    System.out.println("separation=" + result);
+    return result;
+  }
+
+  /**
+   * Computes the intra-cluster density. For every cluster, the intraDensity() of each
+   * representative point (evaluated against the cluster center and the average standard deviation
+   * of all clusters) is divided by the cluster's own standard deviation when that is positive,
+   * and these values are averaged over the cluster's representative points. The returned value
+   * is the average of the per-cluster results, which is also printed to standard output.
+   *
+   * @return the intra-cluster density
+   */
+  double intraClusterDensity() {
+    // average standard deviation over all clusters
+    double stdTotal = 0;
+    for (Integer cId : representativePoints.keySet()) {
+      stdTotal += stDevs.get(cId);
+    }
+    double avgStd = stdTotal / representativePoints.size();
+
+    double densityTotal = 0;
+    for (Map.Entry<Integer, List<VectorWritable>> entry : representativePoints.entrySet()) {
+      Integer cId = entry.getKey();
+      List<VectorWritable> reps = entry.getValue();
+      double clusterTotal = 0;
+      for (VectorWritable rep : reps) {
+        double inDensity = intraDensity(clusters.get(cId).getCenter(), rep.get(), avgStd);
+        Double std = stDevs.get(cId);
+        if (std > 0) {
+          clusterTotal += inDensity / std;
+        }
+      }
+      densityTotal += clusterTotal / reps.size();
+    }
+    double intraClusterDensity = densityTotal / representativePoints.size();
+    System.out.println("intraClusterDensity=" + intraClusterDensity);
+    return intraClusterDensity;
+  }
+
+  /**
+   * Indicator of whether a representative point lies within avgStd of the cluster center.
+   *
+   * @param clusterCenter
+   *          the center of the cluster
+   * @param repPoint
+   *          a representative point of the cluster
+   * @param avgStd
+   *          the distance threshold
+   * @return 1 if the distance from clusterCenter to repPoint is at most avgStd, otherwise 0
+   */
+  double intraDensity(Vector clusterCenter, Vector repPoint, double avgStd) {
+    boolean withinRange = measure.distance(clusterCenter, repPoint) <= avgStd;
+    return withinRange ? 1 : 0;
+  }
+}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java Wed Apr 28 17:37:12 2010
@@ -35,36 +35,37 @@ import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputLogFilter;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedPointWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.VectorWritable;
-public class CDbwMapper extends MapReduceBase implements Mapper<IntWritable, VectorWritable, IntWritable, CDbwDistantPointWritable> {
+public class CDbwMapper extends MapReduceBase implements Mapper<IntWritable, VectorWritable, IntWritable, WeightedPointWritable> {
private Map<Integer, List<VectorWritable>> representativePoints;
- private Map<Integer, CDbwDistantPointWritable> mostDistantPoints = new HashMap<Integer, CDbwDistantPointWritable>();
+ private Map<Integer, WeightedPointWritable> mostDistantPoints = new HashMap<Integer, WeightedPointWritable>();
private DistanceMeasure measure = new EuclideanDistanceMeasure();
- private OutputCollector<IntWritable, CDbwDistantPointWritable> output = null;
+ private OutputCollector<IntWritable, WeightedPointWritable> output = null;
@Override
- public void map(IntWritable clusterId, VectorWritable point, OutputCollector<IntWritable, CDbwDistantPointWritable> output,
+ public void map(IntWritable clusterId, VectorWritable point, OutputCollector<IntWritable, WeightedPointWritable> output,
Reporter reporter) throws IOException {
- this.output = output;
+ this.output = output;
int key = clusterId.get();
- CDbwDistantPointWritable currentMDP = mostDistantPoints.get(key);
+ WeightedPointWritable currentMDP = mostDistantPoints.get(key);
List<VectorWritable> refPoints = representativePoints.get(key);
double totalDistance = 0.0;
for (VectorWritable refPoint : refPoints) {
totalDistance += measure.distance(refPoint.get(), point.get());
}
- if (currentMDP == null || currentMDP.getDistance() < totalDistance) {
- mostDistantPoints.put(key, new CDbwDistantPointWritable(totalDistance, new VectorWritable(point.get().clone())));
+ if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
+ mostDistantPoints.put(key, new WeightedPointWritable(totalDistance, new VectorWritable(point.get().clone())));
}
}
@@ -73,7 +74,7 @@ public class CDbwMapper extends MapReduc
this.measure = measure;
}
- public static Map<Integer, List<VectorWritable>> getReferencePoints(JobConf job) throws SecurityException,
+ public static Map<Integer, List<VectorWritable>> getRepresentativePoints(JobConf job) throws SecurityException,
IllegalArgumentException, NoSuchMethodException, InvocationTargetException {
String statePath = job.get(CDbwDriver.STATE_IN_KEY);
Map<Integer, List<VectorWritable>> representativePoints = new HashMap<Integer, List<VectorWritable>>();
@@ -93,7 +94,8 @@ public class CDbwMapper extends MapReduc
representativePoints.put(key.get(), repPoints);
}
repPoints.add(point);
- point = new VectorWritable(); }
+ point = new VectorWritable();
+ }
} finally {
reader.close();
}
@@ -111,7 +113,7 @@ public class CDbwMapper extends MapReduc
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
Class<?> cl = ccl.loadClass(job.get(CDbwDriver.DISTANCE_MEASURE_KEY));
measure = (DistanceMeasure) cl.newInstance();
- representativePoints = getReferencePoints(job);
+ representativePoints = getRepresentativePoints(job);
} catch (NumberFormatException e) {
throw new IllegalStateException(e);
} catch (SecurityException e) {
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java?rev=939019&r1=939018&r2=939019&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java Wed Apr 28 17:37:12 2010
@@ -29,25 +29,25 @@ import org.apache.hadoop.mapred.MapReduc
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedPointWritable;
import org.apache.mahout.math.VectorWritable;
-public class CDbwReducer extends MapReduceBase implements
- Reducer<IntWritable, CDbwDistantPointWritable, IntWritable, VectorWritable> {
+public class CDbwReducer extends MapReduceBase implements Reducer<IntWritable, WeightedPointWritable, IntWritable, VectorWritable> {
private Map<Integer, List<VectorWritable>> referencePoints;
private OutputCollector<IntWritable, VectorWritable> output;
@Override
- public void reduce(IntWritable key, Iterator<CDbwDistantPointWritable> values,
- OutputCollector<IntWritable, VectorWritable> output, Reporter reporter) throws IOException {
+ public void reduce(IntWritable key, Iterator<WeightedPointWritable> values, OutputCollector<IntWritable, VectorWritable> output,
+ Reporter reporter) throws IOException {
this.output = output;
// find the most distant point
- CDbwDistantPointWritable mdp = null;
+ WeightedPointWritable mdp = null;
while (values.hasNext()) {
- CDbwDistantPointWritable dpw = values.next();
- if (mdp == null || mdp.getDistance() < dpw.getDistance()) {
- mdp = new CDbwDistantPointWritable(dpw.getDistance(), dpw.getPoint());
+ WeightedPointWritable dpw = values.next();
+ if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
+ mdp = new WeightedPointWritable(dpw.getWeight(), dpw.getPoint());
}
}
output.collect(new IntWritable(key.get()), mdp.getPoint());
@@ -74,7 +74,7 @@ public class CDbwReducer extends MapRedu
public void configure(JobConf job) {
super.configure(job);
try {
- referencePoints = CDbwMapper.getReferencePoints(job);
+ referencePoints = CDbwMapper.getRepresentativePoints(job);
} catch (NumberFormatException e) {
throw new IllegalStateException(e);
} catch (SecurityException e) {