You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/07/24 05:39:33 UTC
svn commit: r978786 [2/2] - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/dirichlet/models/
core/src/main/java/org/apache/mahou...
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterer.java Sat Jul 24 03:39:30 2010
@@ -112,11 +112,8 @@ public class MeanShiftCanopyClusterer {
* @return if the cluster is converged
*/
public boolean shiftToMean(MeanShiftCanopy canopy) {
- Vector centroid = canopy.computeCentroid();
- canopy.setConverged(measure.distance(centroid, canopy.getCenter()) < convergenceDelta);
- canopy.setCenter(centroid);
- canopy.setNumPoints(1);
- canopy.setPointTotal(centroid.clone());
+ canopy.computeConvergence(measure, convergenceDelta);
+ canopy.computeParameters();
return canopy.isConverged();
}
@@ -194,8 +191,7 @@ public class MeanShiftCanopyClusterer {
return migratedCanopies;
}
- @SuppressWarnings("unused")
- private static void verifyNonOverlap(List<MeanShiftCanopy> canopies) {
+ protected static void verifyNonOverlap(List<MeanShiftCanopy> canopies) {
Set<Integer> coveredPoints = new HashSet<Integer>();
// verify no overlap
for (MeanShiftCanopy canopy : canopies) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Sat Jul 24 03:39:30 2010
@@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.lib.i
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -439,6 +440,9 @@ public class MeanShiftCanopyDriver exten
MeanShiftCanopy.class);
try {
for (MeanShiftCanopy cluster : clusters) {
+ log.info("Writing Cluster:" + cluster.getId() + " center:" + AbstractCluster.formatVector(cluster.getCenter(), null)
+ + " numPoints:" + cluster.getNumPoints() + " radius:" + AbstractCluster.formatVector(cluster.getRadius(), null) + " to: "
+ + clustersOut.getName());
writer.append(new Text(cluster.getIdentifier()), cluster);
}
} finally {
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java Sat Jul 24 03:39:30 2010
@@ -176,7 +176,7 @@ public class TestClusterInterface extend
Cluster cluster = new Canopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
+ assertEquals("format", "C-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
}
public void testCanopyAsFormatStringSparse() {
@@ -186,7 +186,7 @@ public class TestClusterInterface extend
Cluster cluster = new Canopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
}
public void testCanopyAsFormatStringWithBindings() {
@@ -196,7 +196,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, null };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C-123: [fee:1.100, 1:2.200, 2:3.300]", formatString);
+ assertEquals("format", "C-123{n=0 c=[fee:1.100, 1:2.200, 2:3.300] r=[]}", formatString);
}
public void testCanopyAsFormatStringSparseWithBindings() {
@@ -206,7 +206,7 @@ public class TestClusterInterface extend
Cluster cluster = new Canopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
}
public void testClusterAsFormatString() {
@@ -215,7 +215,7 @@ public class TestClusterInterface extend
Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
+ assertEquals("format", "CL-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
}
public void testClusterAsFormatStringSparse() {
@@ -225,7 +225,7 @@ public class TestClusterInterface extend
Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "CL-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
}
public void testClusterAsFormatStringWithBindings() {
@@ -235,7 +235,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C-123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
+ assertEquals("format", "CL-123{n=0 c=[fee:1.100, 1:2.200, foo:3.300] r=[]}", formatString);
}
public void testClusterAsFormatStringSparseWithBindings() {
@@ -245,7 +245,7 @@ public class TestClusterInterface extend
Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "CL-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
}
public void testMSCanopyAsFormatString() {
@@ -254,7 +254,7 @@ public class TestClusterInterface extend
Cluster cluster = new MeanShiftCanopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
+ assertEquals("format", "MSC-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
}
public void testMSCanopyAsFormatStringSparse() {
@@ -264,7 +264,7 @@ public class TestClusterInterface extend
Cluster cluster = new MeanShiftCanopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "MSC-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
}
public void testMSCanopyAsFormatStringWithBindings() {
@@ -274,7 +274,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C-123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
+ assertEquals("format", "MSC-123{n=0 c=[fee:1.100, 1:2.200, foo:3.300] r=[]}", formatString);
}
public void testMSCanopyAsFormatStringSparseWithBindings() {
@@ -285,7 +285,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C-123: [fee:1.100, foo:3.300]", formatString);
+ assertEquals("format", "MSC-123{n=0 c=[fee:1.100, foo:3.300] r=[]}", formatString);
}
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Sat Jul 24 03:39:30 2010
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.DummyRecordWriter;
@@ -92,7 +92,7 @@ public class TestCanopyCreation extends
*/
private static void printCanopies(List<Canopy> canopies) {
for (Canopy canopy : canopies) {
- System.out.println(canopy.toString());
+ System.out.println(canopy.asFormatString(null));
}
}
@@ -110,9 +110,9 @@ public class TestCanopyCreation extends
super.setUp();
fs = FileSystem.get(new Configuration());
referenceManhattan = CanopyClusterer.createCanopies(getPoints(), manhattanDistanceMeasure, 3.1, 2.1);
- manhattanCentroids = CanopyClusterer.calculateCentroids(referenceManhattan);
+ manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan);
referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), euclideanDistanceMeasure, 3.1, 2.1);
- euclideanCentroids = CanopyClusterer.calculateCentroids(referenceEuclidean);
+ euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean);
}
/** Story: User can cluster points using a ManhattanDistanceMeasure and a reference implementation */
@@ -437,7 +437,7 @@ public class TestCanopyCreation extends
WeightedVectorWritable vector = new WeightedVectorWritable();
while (reader.next(clusterId, vector)) {
count++;
- System.out.println("Txt: " + clusterId + " Vec: " + ClusterBase.formatVector(vector.getVector().get(), null));
+ System.out.println("Txt: " + clusterId + " Vec: " + AbstractCluster.formatVector(vector.getVector().get(), null));
}
assertEquals("number of points", points.size(), count);
reader.close();
@@ -477,7 +477,7 @@ public class TestCanopyCreation extends
WeightedVectorWritable vector = new WeightedVectorWritable();
while (reader.next(clusterId, vector)) {
count++;
- System.out.println("Txt: " + clusterId + " Vec: " + ClusterBase.formatVector(vector.getVector().get(), null));
+ System.out.println("Txt: " + clusterId + " Vec: " + AbstractCluster.formatVector(vector.getVector().get(), null));
}
assertEquals("number of points", points.size(), count);
reader.close();
@@ -502,7 +502,7 @@ public class TestCanopyCreation extends
WeightedVectorWritable vector = new WeightedVectorWritable();
while (reader.next(clusterId, vector)) {
count++;
- System.out.println("Txt: " + clusterId + " Vec: " + ClusterBase.formatVector(vector.getVector().get(), null));
+ System.out.println("Txt: " + clusterId + " Vec: " + AbstractCluster.formatVector(vector.getVector().get(), null));
}
assertEquals("number of points", points.size(), count);
reader.close();
@@ -532,7 +532,7 @@ public class TestCanopyCreation extends
WeightedVectorWritable vw = new WeightedVectorWritable();
while (reader.next(canopyId, vw)) {
count++;
- System.out.println("Txt: " + canopyId.toString() + " Vec: " + ClusterBase.formatVector(vw.getVector().get(), null));
+ System.out.println("Txt: " + canopyId.toString() + " Vec: " + AbstractCluster.formatVector(vw.getVector().get(), null));
}
assertEquals("number of points", points.size(), count);
reader.close();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Sat Jul 24 03:39:30 2010
@@ -35,6 +35,8 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.ClusterObservations;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
@@ -185,12 +187,12 @@ public class TestFuzzyKmeansClustering e
public void testFuzzyKMeansSeqJob() throws Exception {
List<VectorWritable> points = TestKmeansClustering.getPointsWritable(TestKmeansClustering.reference);
-
+
Path pointsPath = getTestTempDirPath("points");
Path clustersPath = getTestTempDirPath("clusters");
Configuration conf = new Configuration();
ClusteringTestUtils.writePointsToFile(points, new Path(pointsPath, "file1"), fs, conf);
-
+
for (int k = 0; k < points.size(); k++) {
System.out.println("testKFuzzyKMeansMRJob k= " + k);
// pick k initial cluster centers at random
@@ -201,18 +203,18 @@ public class TestFuzzyKmeansClustering e
SoftCluster.class);
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
-
- SoftCluster cluster = new SoftCluster(vec);
+
+ SoftCluster cluster = new SoftCluster(vec, i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter(), 1);
+ cluster.observe(cluster.getCenter(), 1);
/*
* writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
*/
writer.append(new Text(cluster.getIdentifier()), cluster);
-
+
}
writer.close();
-
+
// now run the Job using the run() command line options.
Path output = getTestTempDirPath("output");
/* FuzzyKMeansDriver.runJob(pointsPath,
@@ -227,17 +229,12 @@ public class TestFuzzyKmeansClustering e
true,
0);
*/
- String[] args = {
- optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
- optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
- optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
- optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
- optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001",
- optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
- optKey(FuzzyKMeansDriver.M_OPTION), "2.0",
- optKey(DefaultOptionCreator.CLUSTERING_OPTION),
- optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION),
- optKey(DefaultOptionCreator.OVERWRITE_OPTION),
+ String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
+ optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION),
+ output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
+ optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
+ optKey(FuzzyKMeansDriver.M_OPTION), "2.0", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+ optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD };
new FuzzyKMeansDriver().run(args);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-0"), conf);
@@ -248,7 +245,7 @@ public class TestFuzzyKmeansClustering e
}
reader.close();
}
-
+
}
public void testFuzzyKMeansMRJob() throws Exception {
@@ -270,9 +267,9 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec);
+ SoftCluster cluster = new SoftCluster(vec, i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter(), 1);
+ cluster.observe(cluster.getCenter(), 1);
/*
* writer.write(cluster.getIdentifier() + '\t' + SoftCluster.formatCluster(cluster) + '\n');
*/
@@ -295,17 +292,12 @@ public class TestFuzzyKmeansClustering e
true,
0);
*/
- String[] args = {
- optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
- optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(),
- optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
- optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
- optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001",
- optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
- optKey(FuzzyKMeansDriver.M_OPTION), "2.0",
- optKey(DefaultOptionCreator.CLUSTERING_OPTION),
- optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION),
- optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
+ String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), pointsPath.toString(),
+ optKey(DefaultOptionCreator.CLUSTERS_IN_OPTION), clustersPath.toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION),
+ output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
+ optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.001", optKey(DefaultOptionCreator.MAX_ITERATIONS_OPTION), "2",
+ optKey(FuzzyKMeansDriver.M_OPTION), "2.0", optKey(DefaultOptionCreator.CLUSTERING_OPTION),
+ optKey(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
new FuzzyKMeansDriver().run(args);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "clusteredPoints/part-m-00000"), conf);
IntWritable key = new IntWritable();
@@ -330,7 +322,7 @@ public class TestFuzzyKmeansClustering e
Vector vec = tweakValue(points.get(i).get());
SoftCluster cluster = new SoftCluster(vec, i);
- cluster.addPoint(cluster.getCenter(), 1);
+ cluster.observe(cluster.getCenter(), 1);
clusterList.add(cluster);
}
@@ -345,10 +337,9 @@ public class TestFuzzyKmeansClustering e
conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
- DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
mapper.setup(mapContext);
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapContext);
@@ -361,15 +352,15 @@ public class TestFuzzyKmeansClustering e
for (Text key : mapWriter.getKeys()) {
// SoftCluster cluster = SoftCluster.decodeCluster(key);
- List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+ List<ClusterObservations> values = mapWriter.getValue(key);
- for (FuzzyKMeansInfo value : values) {
- Double val = pointTotalProbMap.get(value.getVector());
+ for (ClusterObservations value : values) {
+ Double val = pointTotalProbMap.get(value.getS1());
double probVal = 0.0;
if (val != null) {
probVal = val;
}
- pointTotalProbMap.put(value.getVector(), probVal + value.getProbability());
+ pointTotalProbMap.put(value.getS1(), probVal + value.getS0());
}
}
for (Map.Entry<Vector, Double> entry : pointTotalProbMap.entrySet()) {
@@ -393,7 +384,7 @@ public class TestFuzzyKmeansClustering e
Vector vec = tweakValue(points.get(i).get());
SoftCluster cluster = new SoftCluster(vec, i);
- cluster.addPoint(cluster.getCenter(), 1);
+ cluster.observe(cluster.getCenter(), 1);
clusterList.add(cluster);
}
@@ -408,10 +399,9 @@ public class TestFuzzyKmeansClustering e
conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
- DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
mapper.setup(mapContext);
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapContext);
@@ -419,12 +409,12 @@ public class TestFuzzyKmeansClustering e
// run combiner
FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
- DummyRecordWriter<Text, FuzzyKMeansInfo> combinerWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo>.Context combinerContext = DummyRecordWriter
- .build(combiner, conf, combinerWriter, Text.class, FuzzyKMeansInfo.class);
+ DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+ .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
combiner.setup(combinerContext);
for (Text key : mapWriter.getKeys()) {
- List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+ List<ClusterObservations> values = mapWriter.getValue(key);
combiner.reduce(new Text(key), values, combinerContext);
}
@@ -432,7 +422,7 @@ public class TestFuzzyKmeansClustering e
assertEquals("Combiner Output", k + 1, combinerWriter.getData().size());
for (Text key : combinerWriter.getKeys()) {
- List<FuzzyKMeansInfo> values = combinerWriter.getValue(key);
+ List<ClusterObservations> values = combinerWriter.getValue(key);
assertEquals("too many values", 1, values.size());
}
}
@@ -465,10 +455,9 @@ public class TestFuzzyKmeansClustering e
conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
- DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
mapper.setup(mapContext);
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapContext);
@@ -476,27 +465,24 @@ public class TestFuzzyKmeansClustering e
// run combiner
FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
- DummyRecordWriter<Text, FuzzyKMeansInfo> combinerWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo>.Context combinerContext = DummyRecordWriter
- .build(combiner, conf, combinerWriter, Text.class, FuzzyKMeansInfo.class);
+ DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+ .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
combiner.setup(combinerContext);
for (Text key : mapWriter.getKeys()) {
- List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+ List<ClusterObservations> values = mapWriter.getValue(key);
combiner.reduce(new Text(key), values, combinerContext);
}
// run reducer
FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
DummyRecordWriter<Text, SoftCluster> reducerWriter = new DummyRecordWriter<Text, SoftCluster>();
- Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster>.Context reducerContext = DummyRecordWriter.build(reducer,
- conf,
- reducerWriter,
- Text.class,
- FuzzyKMeansInfo.class);
+ Reducer<Text, ClusterObservations, Text, SoftCluster>.Context reducerContext = DummyRecordWriter
+ .build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
reducer.setup(clusterList, conf);
for (Text key : combinerWriter.getKeys()) {
- List<FuzzyKMeansInfo> values = combinerWriter.getValue(key);
+ List<ClusterObservations> values = combinerWriter.getValue(key);
reducer.reduce(new Text(key), values, reducerContext);
}
@@ -522,9 +508,9 @@ public class TestFuzzyKmeansClustering e
List<SoftCluster> values = reducerWriter.getValue(new Text(clusterId));
SoftCluster cluster = values.get(0);
System.out.println("ref= " + key.toString() + " cluster= " + cluster.toString());
- cluster.recomputeCenter();
- assertEquals("key center: " + key.getCenter().asFormatString() + " does not equal cluster: "
- + cluster.getCenter().asFormatString(), key.getCenter(), cluster.getCenter());
+ cluster.computeParameters();
+ assertEquals("key center: " + AbstractCluster.formatVector(key.getCenter(), null) + " does not equal cluster: "
+ + AbstractCluster.formatVector(cluster.getCenter(), null), key.getCenter(), cluster.getCenter());
}
}
}
@@ -541,11 +527,11 @@ public class TestFuzzyKmeansClustering e
Vector vec = tweakValue(points.get(i).get());
SoftCluster cluster = new SoftCluster(vec, i);
- cluster.addPoint(cluster.getCenter(), 1);
+ cluster.observe(cluster.getCenter(), 1);
clusterList.add(cluster);
}
for (SoftCluster softCluster : clusterList) {
- softCluster.recomputeCenter();
+ softCluster.computeParameters();
}
// run mapper
@@ -560,10 +546,9 @@ public class TestFuzzyKmeansClustering e
conf.set(FuzzyKMeansConfigKeys.EMIT_MOST_LIKELY_KEY, "true");
conf.set(FuzzyKMeansConfigKeys.THRESHOLD_KEY, "0");
- DummyRecordWriter<Text, FuzzyKMeansInfo> mapWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, FuzzyKMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
mapper.setup(mapContext);
for (VectorWritable point : points) {
mapper.map(new Text(), point, mapContext);
@@ -571,27 +556,24 @@ public class TestFuzzyKmeansClustering e
// run combiner
FuzzyKMeansCombiner combiner = new FuzzyKMeansCombiner();
- DummyRecordWriter<Text, FuzzyKMeansInfo> combinerWriter = new DummyRecordWriter<Text, FuzzyKMeansInfo>();
- Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo>.Context combinerContext = DummyRecordWriter
- .build(combiner, conf, combinerWriter, Text.class, FuzzyKMeansInfo.class);
+ DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+ .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
combiner.setup(combinerContext);
for (Text key : mapWriter.getKeys()) {
- List<FuzzyKMeansInfo> values = mapWriter.getValue(key);
+ List<ClusterObservations> values = mapWriter.getValue(key);
combiner.reduce(new Text(key), values, combinerContext);
}
// run reducer
FuzzyKMeansReducer reducer = new FuzzyKMeansReducer();
DummyRecordWriter<Text, SoftCluster> reducerWriter = new DummyRecordWriter<Text, SoftCluster>();
- Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster>.Context reducerContext = DummyRecordWriter.build(reducer,
- conf,
- reducerWriter,
- Text.class,
- FuzzyKMeansInfo.class);
+ Reducer<Text, ClusterObservations, Text, SoftCluster>.Context reducerContext = DummyRecordWriter
+ .build(reducer, conf, reducerWriter, Text.class, ClusterObservations.class);
reducer.setup(clusterList, conf);
for (Text key : combinerWriter.getKeys()) {
- List<FuzzyKMeansInfo> values = combinerWriter.getValue(key);
+ List<ClusterObservations> values = combinerWriter.getValue(key);
reducer.reduce(new Text(key), values, reducerContext);
}
@@ -602,7 +584,7 @@ public class TestFuzzyKmeansClustering e
reducerClusters.add(values.get(0));
}
for (SoftCluster softCluster : reducerClusters) {
- softCluster.recomputeCenter();
+ softCluster.computeParameters();
}
FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
@@ -655,19 +637,19 @@ public class TestFuzzyKmeansClustering e
}
}
- public void testFuzzyKMeansInfoSerialization() throws IOException {
+ public void testClusterObservationsSerialization() throws IOException {
double[] data = { 1.1, 2.2, 3.3 };
Vector vector = new DenseVector(data);
- FuzzyKMeansInfo reference = new FuzzyKMeansInfo(2.0, vector, 1);
+ ClusterObservations reference = new ClusterObservations(1, 2.0, vector, vector);
DataOutputBuffer out = new DataOutputBuffer();
reference.write(out);
- FuzzyKMeansInfo info = new FuzzyKMeansInfo();
+ ClusterObservations info = new ClusterObservations();
DataInputBuffer in = new DataInputBuffer();
in.reset(out.getData(), out.getLength());
info.readFields(in);
- assertEquals("probability", reference.getProbability(), info.getProbability());
- assertTrue("point total", reference.getVector().equals(info.getVector()));
- assertEquals("combiner", reference.getCombinerPass(), info.getCombinerPass());
+ assertEquals("probability", reference.getS0(), info.getS0());
+ assertTrue("point total", reference.getS1().equals(info.getS1()));
+ assertEquals("combiner", reference.getCombinerState(), info.getCombinerState());
}
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Sat Jul 24 03:39:30 2010
@@ -32,6 +32,8 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.ClusterObservations;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -111,31 +113,20 @@ public class TestKmeansClustering extend
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i);
- clusters.add(new VisibleCluster(vec));
+ clusters.add(new Cluster(vec, i));
}
// iterate clusters until they converge
int maxIter = 10;
List<List<Cluster>> clustersList = KMeansClusterer.clusterPoints(points, clusters, measure, maxIter, 0.001);
clusters = clustersList.get(clustersList.size() - 1);
for (int c = 0; c < clusters.size(); c++) {
- Cluster cluster = clusters.get(c);
- System.out.println(cluster.toString());
+ AbstractCluster cluster = clusters.get(c);
+ System.out.println(cluster.asFormatString(null));
assertEquals("Cluster " + c + " test " + (k + 1), expectedNumPoints[k][c], cluster.getNumPoints());
}
}
}
- public void testStd() {
- List<Vector> points = getPoints(reference);
- Cluster c = new Cluster(points.get(0));
- for (Vector p : points) {
- c.addPoint(p);
- if (c.getNumPoints() > 1) {
- assertTrue(c.getStd() > 0.0);
- }
- }
- }
-
private static Map<String, Cluster> loadClusterMap(List<Cluster> clusters) {
Map<String, Cluster> clusterMap = new HashMap<String, Cluster>();
@@ -156,16 +147,15 @@ public class TestKmeansClustering extend
List<VectorWritable> points = getPointsWritable(reference);
for (int k = 0; k < points.size(); k++) {
// pick k initial cluster centers at random
- DummyRecordWriter<Text, KMeansInfo> mapWriter = new DummyRecordWriter<Text, KMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, KMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Cluster cluster = new Cluster(points.get(i).get(), i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter());
+ cluster.observe(cluster.getCenter(), 1);
clusters.add(cluster);
}
mapper.setup(clusters, measure);
@@ -175,16 +165,14 @@ public class TestKmeansClustering extend
mapper.map(new Text(), point, mapContext);
}
assertEquals("Number of map results", k + 1, mapWriter.getData().size());
- // now verify that all points are correctly allocated
- EuclideanDistanceMeasure euclideanDistanceMeasure = measure;
Map<String, Cluster> clusterMap = loadClusterMap(clusters);
for (Text key : mapWriter.getKeys()) {
- Cluster cluster = clusterMap.get(key.toString());
- List<KMeansInfo> values = mapWriter.getValue(key);
- for (KMeansInfo value : values) {
- double distance = euclideanDistanceMeasure.distance(cluster.getCenter(), value.getPointTotal());
- for (Cluster c : clusters) {
- assertTrue("distance error", distance <= euclideanDistanceMeasure.distance(value.getPointTotal(), c.getCenter()));
+ AbstractCluster cluster = clusterMap.get(key.toString());
+ List<ClusterObservations> values = mapWriter.getValue(key);
+ for (ClusterObservations value : values) {
+ double distance = measure.distance(cluster.getCenter(), value.getS1());
+ for (AbstractCluster c : clusters) {
+ assertTrue("distance error", distance <= measure.distance(value.getS1(), c.getCenter()));
}
}
}
@@ -205,17 +193,16 @@ public class TestKmeansClustering extend
List<VectorWritable> points = getPointsWritable(reference);
for (int k = 0; k < points.size(); k++) {
// pick k initial cluster centers at random
- DummyRecordWriter<Text, KMeansInfo> mapWriter = new DummyRecordWriter<Text, KMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, KMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
Cluster cluster = new Cluster(vec, i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter());
+ cluster.observe(cluster.getCenter(), 1);
clusters.add(cluster);
}
mapper.setup(clusters, measure);
@@ -225,12 +212,9 @@ public class TestKmeansClustering extend
}
// now combine the data
KMeansCombiner combiner = new KMeansCombiner();
- DummyRecordWriter<Text, KMeansInfo> combinerWriter = new DummyRecordWriter<Text, KMeansInfo>();
- Reducer<Text, KMeansInfo, Text, KMeansInfo>.Context combinerContext = DummyRecordWriter.build(combiner,
- conf,
- combinerWriter,
- Text.class,
- KMeansInfo.class);
+ DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+ .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
for (Text key : mapWriter.getKeys()) {
combiner.reduce(new Text(key), mapWriter.getValue(key), combinerContext);
}
@@ -240,13 +224,12 @@ public class TestKmeansClustering extend
int count = 0;
Vector total = new DenseVector(2);
for (Text key : combinerWriter.getKeys()) {
- List<KMeansInfo> values = combinerWriter.getValue(key);
+ List<ClusterObservations> values = combinerWriter.getValue(key);
assertEquals("too many values", 1, values.size());
- // String value = values.get(0).toString();
- KMeansInfo info = values.get(0);
+ ClusterObservations info = values.get(0);
- count += info.getPoints();
- total = total.plus(info.getPointTotal());
+ count += info.getS0();
+ total = total.plus(info.getS1());
}
assertEquals("total points", 9, count);
assertEquals("point total[0]", 27, (int) total.get(0));
@@ -269,10 +252,9 @@ public class TestKmeansClustering extend
for (int k = 0; k < points.size(); k++) {
System.out.println("K = " + k);
// pick k initial cluster centers at random
- DummyRecordWriter<Text, KMeansInfo> mapWriter = new DummyRecordWriter<Text, KMeansInfo>();
- Mapper<WritableComparable<?>, VectorWritable, Text, KMeansInfo>.Context mapContext = DummyRecordWriter.build(mapper,
- conf,
- mapWriter);
+ DummyRecordWriter<Text, ClusterObservations> mapWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Mapper<WritableComparable<?>, VectorWritable, Text, ClusterObservations>.Context mapContext = DummyRecordWriter
+ .build(mapper, conf, mapWriter);
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
@@ -288,12 +270,9 @@ public class TestKmeansClustering extend
}
// now combine the data
KMeansCombiner combiner = new KMeansCombiner();
- DummyRecordWriter<Text, KMeansInfo> combinerWriter = new DummyRecordWriter<Text, KMeansInfo>();
- Reducer<Text, KMeansInfo, Text, KMeansInfo>.Context combinerContext = DummyRecordWriter.build(combiner,
- conf,
- combinerWriter,
- Text.class,
- KMeansInfo.class);
+ DummyRecordWriter<Text, ClusterObservations> combinerWriter = new DummyRecordWriter<Text, ClusterObservations>();
+ Reducer<Text, ClusterObservations, Text, ClusterObservations>.Context combinerContext = DummyRecordWriter
+ .build(combiner, conf, combinerWriter, Text.class, ClusterObservations.class);
for (Text key : mapWriter.getKeys()) {
combiner.reduce(new Text(key), mapWriter.getValue(key), combinerContext);
}
@@ -302,11 +281,11 @@ public class TestKmeansClustering extend
KMeansReducer reducer = new KMeansReducer();
reducer.setup(clusters, measure);
DummyRecordWriter<Text, Cluster> reducerWriter = new DummyRecordWriter<Text, Cluster>();
- Reducer<Text, KMeansInfo, Text, Cluster>.Context reducerContext = DummyRecordWriter.build(reducer,
- conf,
- reducerWriter,
- Text.class,
- KMeansInfo.class);
+ Reducer<Text, ClusterObservations, Text, Cluster>.Context reducerContext = DummyRecordWriter.build(reducer,
+ conf,
+ reducerWriter,
+ Text.class,
+ ClusterObservations.class);
for (Text key : combinerWriter.getKeys()) {
reducer.reduce(new Text(key), combinerWriter.getValue(key), reducerContext);
}
@@ -339,7 +318,7 @@ public class TestKmeansClustering extend
converged = converged && cluster.isConverged();
// Since we aren't roundtripping through Writable, we need to compare the reference center with the
// cluster centroid
- cluster.recomputeCenter();
+ cluster.computeParameters();
assertEquals(ref.getCenter(), cluster.getCenter());
}
if (k == 8) {
@@ -353,7 +332,7 @@ public class TestKmeansClustering extend
/** Story: User wishes to run kmeans job on reference data */
public void testKMeansSeqJob() throws Exception {
List<VectorWritable> points = getPointsWritable(reference);
-
+
Path pointsPath = getTestTempDirPath("points");
Path clustersPath = getTestTempDirPath("clusters");
Configuration conf = new Configuration();
@@ -365,13 +344,13 @@ public class TestKmeansClustering extend
Path path = new Path(clustersPath, "part-00000");
FileSystem fs = FileSystem.get(path.toUri(), conf);
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Cluster.class);
-
+
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
-
+
Cluster cluster = new Cluster(vec, i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter());
+ cluster.observe(cluster.getCenter(), 1);
writer.append(new Text(cluster.getIdentifier()), cluster);
}
writer.close();
@@ -385,7 +364,7 @@ public class TestKmeansClustering extend
optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION),
optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD };
new KMeansDriver().run(args);
-
+
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(clusteredPointsPath, "part-m-0"), conf);
@@ -426,7 +405,7 @@ public class TestKmeansClustering extend
Cluster cluster = new Cluster(vec, i);
// add the center so the centroid will be correct upon output
- cluster.addPoint(cluster.getCenter());
+ cluster.observe(cluster.getCenter(), 1);
writer.append(new Text(cluster.getIdentifier()), cluster);
}
writer.close();
@@ -491,8 +470,6 @@ public class TestKmeansClustering extend
// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(outputPath, "clusteredPoints");
- //String[] outFiles = outDir.list();
- //assertEquals("output dir files?", 4, outFiles.length);
DummyOutputCollector<IntWritable, WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable, WeightedVectorWritable>();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(clusteredPointsPath, "part-m-00000"), conf);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java Sat Jul 24 03:39:30 2010
@@ -29,6 +29,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.RandomAccessSparseVector;
@@ -87,7 +88,7 @@ public class TestRandomSeedGenerator ext
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "part-randomSeed"), conf);
Writable key = (Writable) reader.getKeyClass().newInstance();
- Cluster value = (Cluster) reader.getValueClass().newInstance();
+ AbstractCluster value = (AbstractCluster) reader.getValueClass().newInstance();
int clusterCount = 0;
Set<Integer> set = new HashSet<Integer>();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Sat Jul 24 03:39:30 2010
@@ -69,7 +69,7 @@ public class TestMeanShift extends Mahou
}
}
for (MeanShiftCanopy canopy : canopies) {
- int ch = 'A' + canopy.getCanopyId();
+ int ch = 'A' + canopy.getId();
for (int pid : canopy.getBoundPoints().toList()) {
Vector pt = raw[pid];
out[(int) pt.getQuick(0)][(int) pt.getQuick(1)] = (char) ch;
@@ -203,8 +203,8 @@ public class TestMeanShift extends Mahou
for (Map.Entry<String, MeanShiftCanopy> stringMeanShiftCanopyEntry : refCanopyMap.entrySet()) {
MeanShiftCanopy ref = stringMeanShiftCanopyEntry.getValue();
- MeanShiftCanopy canopy = canopyMap.get((ref.isConverged() ? "V-" : "C-") + ref.getCanopyId());
- assertEquals("ids", ref.getCanopyId(), canopy.getCanopyId());
+ MeanShiftCanopy canopy = canopyMap.get((ref.isConverged() ? "MSV-" : "MSC-") + ref.getId());
+ assertEquals("ids", ref.getId(), canopy.getId());
assertEquals("centers(" + ref.getIdentifier() + ')', ref.getCenter().asFormatString(), canopy.getCenter().asFormatString());
assertEquals("bound points", ref.getBoundPoints().toList().size(), canopy.getBoundPoints().toList().size());
}
@@ -241,6 +241,7 @@ public class TestMeanShift extends Mahou
conf.set(MeanShiftCanopyConfigKeys.T1_KEY, "4");
conf.set(MeanShiftCanopyConfigKeys.T2_KEY, "1");
conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, "0.5");
+ conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, "output/control");
MeanShiftCanopyMapper mapper = new MeanShiftCanopyMapper();
DummyRecordWriter<Text, MeanShiftCanopy> mapWriter = new DummyRecordWriter<Text, MeanShiftCanopy>();
@@ -280,11 +281,10 @@ public class TestMeanShift extends Mahou
for (Map.Entry<String, MeanShiftCanopy> mapEntry : reducerReferenceMap.entrySet()) {
MeanShiftCanopy refCanopy = mapEntry.getValue();
- List<MeanShiftCanopy> values = reduceWriter.getValue(new Text((refCanopy.isConverged() ? "V-" : "C-")
- + refCanopy.getCanopyId()));
+ List<MeanShiftCanopy> values = reduceWriter.getValue(new Text((refCanopy.isConverged() ? "MSV-" : "MSC-") + refCanopy.getId()));
assertEquals("values", 1, values.size());
MeanShiftCanopy reducerCanopy = values.get(0);
- assertEquals("ids", refCanopy.getCanopyId(), reducerCanopy.getCanopyId());
+ assertEquals("ids", refCanopy.getId(), reducerCanopy.getId());
int refNumPoints = refCanopy.getNumPoints();
int reducerNumPoints = reducerCanopy.getNumPoints();
assertEquals("numPoints", refNumPoints, reducerNumPoints);
Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java?rev=978786&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java Sat Jul 24 03:39:30 2010
@@ -0,0 +1,11 @@
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+public class ClustersFilter implements PathFilter {
+ @Override
+ public boolean accept(Path path) {
+ return (path.toString().contains("/clusters-"));
+ }
+}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java Sat Jul 24 03:39:30 2010
@@ -17,17 +17,20 @@
package org.apache.mahout.clustering.display;
-import java.awt.BasicStroke;
import java.awt.Graphics;
import java.awt.Graphics2D;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.canopy.CanopyClusterer;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
@@ -35,12 +38,6 @@ class DisplayCanopy extends DisplayClust
private static final long serialVersionUID = 1L;
- private static List<Canopy> canopies;
-
- private static final double T1 = 3.0;
-
- private static final double T2 = 1.6;
-
DisplayCanopy() {
initialize();
this.setTitle("Canopy Clusters (>" + (int) (SIGNIFICANCE * 100) + "% of population)");
@@ -48,33 +45,38 @@ class DisplayCanopy extends DisplayClust
@Override
public void paint(Graphics g) {
- Graphics2D g2 = (Graphics2D) g;
- plotSampleData(g2);
- Vector dv = new DenseVector(2);
- for (Canopy canopy : canopies) {
- if (canopy.getNumPoints() > DisplayClustering.SAMPLE_DATA.size() * SIGNIFICANCE) {
- g2.setStroke(new BasicStroke(2));
- g2.setColor(COLORS[1]);
- dv.assign(T1);
- Vector center = canopy.computeCentroid();
- DisplayClustering.plotEllipse(g2, center, dv);
- g2.setStroke(new BasicStroke(3));
- g2.setColor(COLORS[0]);
- dv.assign(T2);
- DisplayClustering.plotEllipse(g2, center, dv);
- }
- }
+ plotSampleData((Graphics2D) g);
+ plotClusters((Graphics2D) g);
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+ IllegalAccessException {
+ SIGNIFICANCE = 0.05;
+ Path samples = new Path("samples");
+ Path output = new Path("output");
+ HadoopUtil.overwriteOutput(samples);
+ HadoopUtil.overwriteOutput(output);
RandomUtils.useTestSeed();
- DisplayClustering.generateSamples();
- List<Vector> points = new ArrayList<Vector>();
- for (VectorWritable sample : SAMPLE_DATA) {
- points.add(sample.get());
+ generateSamples();
+ writeSampleData(samples);
+ boolean b = true;
+ if (b) {
+ new CanopyDriver().buildClusters(samples, output, ManhattanDistanceMeasure.class.getName(), T1, T2, true);
+ loadClusters(output);
+ } else {
+ List<Vector> points = new ArrayList<Vector>();
+ for (VectorWritable sample : SAMPLE_DATA) {
+ points.add(sample.get());
+ }
+ List<Canopy> canopies = CanopyClusterer.createCanopies(points, new ManhattanDistanceMeasure(), T1, T2);
+ CanopyClusterer.updateCentroids(canopies);
+ List<Cluster> clusters = new ArrayList<Cluster>();
+ for (Canopy canopy : canopies)
+ clusters.add(canopy);
+ CLUSTERS.add(clusters);
}
- canopies = CanopyClusterer.createCanopies(points, new ManhattanDistanceMeasure(), T1, T2);
- CanopyClusterer.updateCentroids(canopies);
+
new DisplayCanopy();
}
+
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java Sat Jul 24 03:39:30 2010
@@ -28,11 +28,21 @@ import java.awt.event.WindowEvent;
import java.awt.geom.AffineTransform;
import java.awt.geom.Ellipse2D;
import java.awt.geom.Rectangle2D;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
+import org.apache.mahout.clustering.kmeans.OutputLogFilter;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
@@ -59,6 +69,10 @@ public class DisplayClustering extends F
protected static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
Color.lightGray };
+ protected static final double T1 = 3.0;
+
+ protected static final double T2 = 2.8;
+
protected static int res; // screen resolution
public DisplayClustering() {
@@ -215,6 +229,54 @@ public class DisplayClustering extends F
}
}
+ protected static void writeSampleData(Path output) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class);
+ try {
+ for (VectorWritable vw : SAMPLE_DATA) {
+ writer.append(new Text(), vw);
+ }
+ } finally {
+ writer.close();
+ }
+ }
+
+ protected static List<Cluster> readClusters(Path clustersIn) throws IOException, InstantiationException, IllegalAccessException {
+ List<Cluster> clusters = new ArrayList<Cluster>();
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(clustersIn.toUri(), conf);
+ FileStatus[] status = fs.listStatus(clustersIn, new OutputLogFilter());
+ for (FileStatus s : status) {
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(), conf);
+ try {
+ Text key = new Text();
+ Writable value = (Writable) reader.getValueClass().newInstance();
+ while (reader.next(key, value)) {
+ Cluster cluster = (Cluster) value;
+ log.info("Reading Cluster:" + cluster.getId() + " center:" + AbstractCluster.formatVector(cluster.getCenter(), null)
+ + " numPoints:" + cluster.getNumPoints() + " radius:" + AbstractCluster.formatVector(cluster.getRadius(), null));
+ clusters.add(cluster);
+ value = (Writable) reader.getValueClass().newInstance();
+ }
+ } finally {
+ reader.close();
+ }
+ }
+ return clusters;
+ }
+
+ protected static void loadClusters(Path output) throws IOException, InstantiationException, IllegalAccessException{
+ List<Cluster> clusters = new ArrayList<Cluster>();
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ FileStatus[] status = fs.listStatus(output, new ClustersFilter());
+ for (FileStatus s : status) {
+ clusters = readClusters(s.getPath());
+ CLUSTERS.add(clusters);
+ }
+ }
+
/**
* Generate random samples and add them to the sampleData
*
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java Sat Jul 24 03:39:30 2010
@@ -19,11 +19,16 @@ package org.apache.mahout.clustering.dis
import java.awt.Graphics;
import java.awt.Graphics2D;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansClusterer;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
@@ -44,43 +49,67 @@ class DisplayFuzzyKMeans extends Display
plotClusters((Graphics2D) g);
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+ IllegalAccessException {
DistanceMeasure measure = new ManhattanDistanceMeasure();
double threshold = 0.001;
int numClusters = 3;
int numIterations = 10;
int m = 3;
-
+
+ Path samples = new Path("samples");
+ Path output = new Path("output");
+ HadoopUtil.overwriteOutput(samples);
+ HadoopUtil.overwriteOutput(output);
RandomUtils.useTestSeed();
DisplayClustering.generateSamples();
- List<Vector> points = new ArrayList<Vector>();
- for (VectorWritable sample : SAMPLE_DATA) {
- points.add((Vector) sample.get());
- }
- int id = 0;
- List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();
- for (Vector point : points) {
- if (initialClusters.size() < Math.min(numClusters, points.size())) {
- initialClusters.add(new SoftCluster(point, id++));
- } else {
- break;
+ boolean b = true;
+ if (b) {
+ writeSampleData(samples);
+ Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3);
+ FuzzyKMeansDriver.runJob(samples,
+ clusters,
+ output,
+ measure.getClass().getName(),
+ threshold,
+ numIterations,
+ 1,
+ m,
+ true,
+ true,
+ threshold,
+ b);
+ loadClusters(output);
+ } else {
+ List<Vector> points = new ArrayList<Vector>();
+ for (VectorWritable sample : SAMPLE_DATA) {
+ points.add((Vector) sample.get());
}
- }
- List<List<SoftCluster>> results = FuzzyKMeansClusterer.clusterPoints(points,
- initialClusters,
- measure,
- threshold,
- m,
- numIterations);
- for (List<SoftCluster> models : results) {
- List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
- for (SoftCluster cluster : models) {
- org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
- if (isSignificant(cluster2)) {
- clusters.add(cluster2);
+ int id = 0;
+ List<SoftCluster> initialClusters = new ArrayList<SoftCluster>();
+ for (Vector point : points) {
+ if (initialClusters.size() < Math.min(numClusters, points.size())) {
+ initialClusters.add(new SoftCluster(point, id++));
+ } else {
+ break;
+ }
+ }
+ List<List<SoftCluster>> results = FuzzyKMeansClusterer.clusterPoints(points,
+ initialClusters,
+ measure,
+ threshold,
+ m,
+ numIterations);
+ for (List<SoftCluster> models : results) {
+ List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
+ for (SoftCluster cluster : models) {
+ org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
+ if (isSignificant(cluster2)) {
+ clusters.add(cluster2);
+ }
}
+ CLUSTERS.add(clusters);
}
- CLUSTERS.add(clusters);
}
new DisplayFuzzyKMeans();
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java Sat Jul 24 03:39:30 2010
@@ -19,11 +19,17 @@ package org.apache.mahout.clustering.dis
import java.awt.Graphics;
import java.awt.Graphics2D;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.apache.mahout.clustering.kmeans.KMeansClusterer;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
@@ -39,40 +45,52 @@ class DisplayKMeans extends DisplayClust
this.setTitle("k-Means Clusters (>" + (int) (SIGNIFICANCE * 100) + "% of population)");
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+ IllegalAccessException {
DistanceMeasure measure = new ManhattanDistanceMeasure();
int numClusters = 3;
int maxIter = 10;
double distanceThreshold = 0.001;
-
+ Path samples = new Path("samples");
+ Path output = new Path("output");
+ HadoopUtil.overwriteOutput(samples);
+ HadoopUtil.overwriteOutput(output);
+
RandomUtils.useTestSeed();
DisplayClustering.generateSamples();
- List<Vector> points = new ArrayList<Vector>();
- for (VectorWritable sample : SAMPLE_DATA) {
- points.add(sample.get());
- }
- List<Cluster> initialClusters = new ArrayList<Cluster>();
- int id = 0;
- for (Vector point : points) {
- if (initialClusters.size() < Math.min(numClusters, points.size())) {
- initialClusters.add(new Cluster(point, id++));
- } else {
- break;
+ writeSampleData(samples);
+ boolean b = true;
+ if (b) {
+ Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3);
+ KMeansDriver.runJob(samples, clusters, output, measure.getClass().getName(), distanceThreshold, maxIter, 1, true, true);
+ loadClusters(output);
+ } else {
+ List<Vector> points = new ArrayList<Vector>();
+ for (VectorWritable sample : SAMPLE_DATA) {
+ points.add(sample.get());
}
- }
- result = KMeansClusterer.clusterPoints(points, initialClusters, measure, maxIter, distanceThreshold);
- for (List<Cluster> models : result) {
- List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
- for (Cluster cluster : models) {
- org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
- if (isSignificant(cluster2)) {
- clusters.add(cluster2);
+ List<Cluster> initialClusters = new ArrayList<Cluster>();
+ int id = 0;
+ for (Vector point : points) {
+ if (initialClusters.size() < Math.min(numClusters, points.size())) {
+ initialClusters.add(new Cluster(point, id++));
+ } else {
+ break;
}
}
- CLUSTERS.add(clusters);
- }
- System.out.println(result.size());
+ result = KMeansClusterer.clusterPoints(points, initialClusters, measure, maxIter, distanceThreshold);
+ for (List<Cluster> models : result) {
+ List<org.apache.mahout.clustering.Cluster> clusters = new ArrayList<org.apache.mahout.clustering.Cluster>();
+ for (AbstractCluster cluster : models) {
+ org.apache.mahout.clustering.Cluster cluster2 = (org.apache.mahout.clustering.Cluster) cluster;
+ if (isSignificant(cluster2)) {
+ clusters.add(cluster2);
+ }
+ }
+ CLUSTERS.add(clusters);
+ }
+ }
new DisplayKMeans();
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java Sat Jul 24 03:39:30 2010
@@ -21,11 +21,16 @@ import java.awt.Color;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.geom.AffineTransform;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyClusterer;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
+import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
@@ -38,8 +43,6 @@ final class DisplayMeanShift extends Dis
private static final Logger log = LoggerFactory.getLogger(DisplayMeanShift.class);
- private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
private static double t1;
private static double t2;
@@ -70,7 +73,8 @@ final class DisplayMeanShift extends Dis
DisplayClustering.plotRectangle(g2, v.get(), dv);
}
int i = 0;
- for (MeanShiftCanopy canopy : canopies) {
+ for (Cluster cluster : CLUSTERS.get(CLUSTERS.size()-1)) {
+ MeanShiftCanopy canopy = (MeanShiftCanopy) cluster;
if (canopy.getBoundPoints().toList().size() >= SIGNIFICANCE * DisplayClustering.SAMPLE_DATA.size()) {
g2.setColor(COLORS[Math.min(i++, DisplayClustering.COLORS.length - 1)]);
int count = 0;
@@ -88,20 +92,35 @@ final class DisplayMeanShift extends Dis
}
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException, InstantiationException, IllegalAccessException, InterruptedException,
+ ClassNotFoundException {
t1 = 1.5;
- t2 = 0.1;
+ t2 = 0.5;
SIGNIFICANCE = 0.02;
+ EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
+
+ Path samples = new Path("samples");
+ Path output = new Path("output");
+ HadoopUtil.overwriteOutput(samples);
+ HadoopUtil.overwriteOutput(output);
RandomUtils.useTestSeed();
DisplayClustering.generateSamples();
- List<Vector> points = new ArrayList<Vector>();
- for (VectorWritable sample : SAMPLE_DATA) {
- points.add(sample.get());
- }
- canopies = MeanShiftCanopyClusterer.clusterPoints(points, new EuclideanDistanceMeasure(), 0.005, t1, t2, 20);
- for (MeanShiftCanopy canopy : canopies) {
- log.info(canopy.toString());
+ writeSampleData(samples);
+ boolean b = true;
+ if (b) {
+ MeanShiftCanopyDriver.runJob(samples, output, measure.getClass().getName(), t1, t2, 0.005, 20, false, true, true);
+ loadClusters(output);
+ } else {
+ List<Vector> points = new ArrayList<Vector>();
+ for (VectorWritable sample : SAMPLE_DATA) {
+ points.add(sample.get());
+ }
+ List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
+ canopies = MeanShiftCanopyClusterer.clusterPoints(points, measure, 0.005, t1, t2, 20);
+ for (MeanShiftCanopy canopy : canopies) {
+ log.info(canopy.toString());
+ }
}
new DisplayMeanShift();
}
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Jul 24 03:39:30 2010
@@ -40,8 +40,8 @@ import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
@@ -191,7 +191,7 @@ public final class ClusterDumper extends
WeightedVectorWritable point = iterator.next();
writer.write(String.valueOf(point.getWeight()));
writer.write(": ");
- writer.write(ClusterBase.formatVector(point.getVector().get(), dictionary));
+ writer.write(AbstractCluster.formatVector(point.getVector().get(), dictionary));
if (iterator.hasNext()) {
writer.write("\n\t");
}
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Sat Jul 24 03:39:30 2010
@@ -129,7 +129,7 @@ public class TestClusterDumper extends M
} else {
namedVector = new NamedVector(vector, "P(" + i + ')');
}
- System.out.println(ClusterBase.formatVector(namedVector, termDictionary));
+ System.out.println(AbstractCluster.formatVector(namedVector, termDictionary));
sampleData.add(new VectorWritable(namedVector));
i++;
}
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=978786&r1=978785&r2=978786&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Sat Jul 24 03:39:30 2010
@@ -29,8 +29,8 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.canopy.CanopyDriver;
@@ -77,7 +77,7 @@ public class TestCDbwEvaluator extends M
IntWritable clusterId = new IntWritable(0);
VectorWritable point = new VectorWritable();
while (reader.next(clusterId, point)) {
- System.out.println("\tC-" + clusterId + ": " + ClusterBase.formatVector(point.get(), null));
+ System.out.println("\tC-" + clusterId + ": " + AbstractCluster.formatVector(point.get(), null));
}
} finally {
reader.close();