Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/29 18:16:11 UTC
svn commit: r939360 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clus...
Author: jeastman
Date: Thu Apr 29 16:16:10 2010
New Revision: 939360
URL: http://svn.apache.org/viewvc?rev=939360&view=rev
Log:
MAHOUT-236: modified point clustering jobs to output WeightedVectorWritables containing the clustered point and its probability of membership. Still only outputting the top cluster for each point.
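As an illustration of the consumer-facing change, here is a minimal sketch of reading the new clustered-points output back: keys are cluster ids, values are now (weight, vector) pairs instead of bare vectors. The reader class is hypothetical and the path simply mirrors the one used by the unit tests in this commit.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.clustering.WeightedVectorWritable;

public class ReadClusteredPoints {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // illustrative path; matches the layout used by the tests below
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
    IntWritable clusterId = new IntWritable();
    WeightedVectorWritable value = new WeightedVectorWritable();
    while (reader.next(clusterId, value)) {
      // value.toString() prints "weight: vector"
      System.out.println(clusterId.get() + " -> " + value);
    }
    reader.close();
  }
}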
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java
- copied, changed from r939144, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java
Removed:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
Copied: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java (from r939144, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java?p2=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java&p1=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java&r1=939144&r2=939360&rev=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedPointWritable.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/WeightedVectorWritable.java Thu Apr 29 16:16:10 2010
@@ -24,7 +24,11 @@ import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.math.VectorWritable;
-public class WeightedPointWritable implements Writable {
+public class WeightedVectorWritable implements Writable {
+
+ private double weight;
+
+ private VectorWritable vector;
/**
* @return the weight
@@ -36,39 +40,35 @@ public class WeightedPointWritable imple
/**
* @return the point
*/
- public VectorWritable getPoint() {
- return point;
+ public VectorWritable getVector() {
+ return vector;
}
- public WeightedPointWritable(double weight, VectorWritable point) {
+ public WeightedVectorWritable(double weight, VectorWritable vector) {
super();
this.weight = weight;
- this.point = point;
+ this.vector = vector;
}
- public WeightedPointWritable() {
+ public WeightedVectorWritable() {
super();
}
- private double weight;
-
- private VectorWritable point;
-
@Override
public void readFields(DataInput in) throws IOException {
weight = in.readDouble();
- point = new VectorWritable();
- point.readFields(in);
+ vector = new VectorWritable();
+ vector.readFields(in);
}
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(weight);
- point.write(out);
+ vector.write(out);
}
public String toString() {
- return String.valueOf(weight) + ": " + (point == null ? "null" : ClusterBase.formatVector(point.get(), null));
+ return String.valueOf(weight) + ": " + (vector == null ? "null" : ClusterBase.formatVector(vector.get(), null));
}
}
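For reference, a quick round-trip sketch of the renamed Writable using its write/readFields methods shown above; the test class is hypothetical and DenseVector comes from mahout-math.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.VectorWritable;

public class WeightedVectorWritableRoundTrip {
  public static void main(String[] args) throws Exception {
    WeightedVectorWritable in =
        new WeightedVectorWritable(0.75, new VectorWritable(new DenseVector(new double[] {1.0, 2.0})));
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    in.write(new DataOutputStream(bytes));
    WeightedVectorWritable out = new WeightedVectorWritable();
    out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
    // weight and vector survive serialization
    System.out.println(out.getWeight() + " / " + out.getVector().get());
  }
}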
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java Thu Apr 29 16:16:10 2010
@@ -27,37 +27,38 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
public class CanopyClusterer {
-
+
private int nextCanopyId;
-
+
private int numVectors;
-
+
// the T1 distance threshold
private double t1;
-
+
// the T2 distance threshold
private double t2;
-
+
// the distance measure
private DistanceMeasure measure;
-
+
// private int nextClusterId = 0;
-
+
public CanopyClusterer(DistanceMeasure measure, double t1, double t2) {
this.t1 = t1;
this.t2 = t2;
this.measure = measure;
}
-
+
public CanopyClusterer(JobConf job) {
this.configure(job);
}
-
+
/**
* Configure the Canopy and its distance measure
*
@@ -81,14 +82,14 @@ public class CanopyClusterer {
t2 = Double.parseDouble(job.get(CanopyConfigKeys.T2_KEY));
nextCanopyId = 0;
}
-
+
/** Configure the Canopy for unit tests */
public void config(DistanceMeasure aMeasure, double aT1, double aT2) {
measure = aMeasure;
t1 = aT1;
t2 = aT2;
}
-
+
/**
* This is the same algorithm as the reference but inverted to iterate over existing canopies instead of the
* points. Because of this it does not need to actually store the points, instead storing a total points
@@ -118,7 +119,7 @@ public class CanopyClusterer {
}
numVectors++;
}
-
+
/**
* This method is used by the CanopyMapper to perform canopy inclusion tests and to emit the point and its
* covering canopies to the output. The CanopyCombiner will then sum the canopy points and produce the
@@ -131,9 +132,8 @@ public class CanopyClusterer {
* @param collector
* an OutputCollector in which to emit the point
*/
- public void emitPointToNewCanopies(Vector point,
- List<Canopy> canopies,
- OutputCollector<Text,Vector> collector) throws IOException {
+ public void emitPointToNewCanopies(Vector point, List<Canopy> canopies, OutputCollector<Text, Vector> collector)
+ throws IOException {
boolean pointStronglyBound = false;
for (Canopy canopy : canopies) {
double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
@@ -148,7 +148,7 @@ public class CanopyClusterer {
canopy.emitPoint(point, collector);
}
}
-
+
/**
* This method is used by the CanopyMapper to perform canopy inclusion tests and to emit the point keyed by
* its covering canopies to the output. if the point is not covered by any canopies (due to canopy centroid
@@ -163,10 +163,8 @@ public class CanopyClusterer {
* @param reporter
* to report status of the job
*/
- public void emitPointToExistingCanopies(Vector point,
- List<Canopy> canopies,
- OutputCollector<IntWritable,VectorWritable> collector,
- Reporter reporter) throws IOException {
+ public void emitPointToExistingCanopies(Vector point, List<Canopy> canopies,
+ OutputCollector<IntWritable, WeightedVectorWritable> collector, Reporter reporter) throws IOException {
double minDist = Double.MAX_VALUE;
Canopy closest = null;
boolean isCovered = false;
@@ -175,7 +173,7 @@ public class CanopyClusterer {
if (dist < t1) {
isCovered = true;
VectorWritable vw = new VectorWritable(point);
- collector.collect(new IntWritable(canopy.getId()), vw);
+ collector.collect(new IntWritable(canopy.getId()), new WeightedVectorWritable(1, vw));
reporter.setStatus("Emit Canopy ID:" + canopy.getIdentifier());
} else if (dist < minDist) {
minDist = dist;
@@ -185,11 +183,11 @@ public class CanopyClusterer {
// if the point is not contained in any canopies (due to canopy centroid
// clustering), emit the point to the closest covering canopy.
if (!isCovered) {
- collector.collect(new IntWritable(closest.getId()), new VectorWritable(point));
+ collector.collect(new IntWritable(closest.getId()), new WeightedVectorWritable(1, new VectorWritable(point)));
reporter.setStatus("Emit Closest Canopy ID:" + closest.getIdentifier());
}
}
-
+
/**
* Return if the point is covered by the canopy
*
@@ -200,7 +198,7 @@ public class CanopyClusterer {
public boolean canopyCovers(Canopy canopy, Vector point) {
return measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point) < t1;
}
-
+
/**
* Iterate through the points, adding new canopies. Return the canopies.
*
@@ -246,7 +244,7 @@ public class CanopyClusterer {
}
return canopies;
}
-
+
/**
* Iterate through the canopies, adding their centroids to a list
*
@@ -261,7 +259,7 @@ public class CanopyClusterer {
}
return result;
}
-
+
/**
* Iterate through the canopies, resetting their center to their centroids
*
@@ -273,5 +271,5 @@ public class CanopyClusterer {
canopy.setCenter(canopy.computeCentroid());
}
}
-
+
}
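Stripped of the Hadoop plumbing, the rule in emitPointToExistingCanopies above is: emit the point (with weight 1) to every canopy whose center lies within T1, and fall back to the single closest canopy when none covers it. A standalone sketch of that rule, using the Canopy and DistanceMeasure types from the diff (the holder class is hypothetical):

import java.util.ArrayList;
import java.util.List;
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;

public class CanopyAssignmentSketch {
  /** Ids of the canopies the point would be emitted to, per emitPointToExistingCanopies. */
  public static List<Integer> assign(Vector point, List<Canopy> canopies, DistanceMeasure measure, double t1) {
    List<Integer> emitted = new ArrayList<Integer>();
    double minDist = Double.MAX_VALUE;
    Canopy closest = null;
    for (Canopy canopy : canopies) {
      double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
      if (dist < t1) {
        emitted.add(canopy.getId()); // covered: emit with weight 1
      } else if (dist < minDist) {
        minDist = dist;
        closest = canopy;
      }
    }
    if (emitted.isEmpty() && closest != null) {
      emitted.add(closest.getId()); // covered by no canopy: emit to the closest one
    }
    return emitted;
  }
}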
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java Thu Apr 29 16:16:10 2010
@@ -38,9 +38,9 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -155,7 +155,7 @@ public final class ClusterDriver {
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputKeyClass(IntWritable.class);
- conf.setOutputValueClass(VectorWritable.class);
+ conf.setOutputValueClass(WeightedVectorWritable.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(points));
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java Thu Apr 29 16:16:10 2010
@@ -32,11 +32,12 @@ import org.apache.hadoop.mapred.MapReduc
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
public class ClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>,VectorWritable,IntWritable,VectorWritable> {
+ Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
private CanopyClusterer canopyClusterer;
private final List<Canopy> canopies = new ArrayList<Canopy>();
@@ -44,7 +45,7 @@ public class ClusterMapper extends MapRe
@Override
public void map(WritableComparable<?> key,
VectorWritable point,
- OutputCollector<IntWritable,VectorWritable> output,
+ OutputCollector<IntWritable,WeightedVectorWritable> output,
Reporter reporter) throws IOException {
canopyClusterer.emitPointToExistingCanopies(point.get(), canopies, output, reporter);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterMapper.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletClusterMapper.java Thu Apr 29 16:16:10 2010
@@ -36,10 +36,11 @@ import org.apache.hadoop.mapred.OutputCo
import org.apache.hadoop.mapred.OutputLogFilter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.VectorWritable;
public class DirichletClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>, VectorWritable, IntWritable, VectorWritable> {
+ Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable> {
private OutputCollector<IntWritable, VectorWritable> output;
@@ -47,7 +48,7 @@ public class DirichletClusterMapper exte
@SuppressWarnings("unchecked")
@Override
- public void map(WritableComparable<?> key, VectorWritable vector, OutputCollector<IntWritable, VectorWritable> output,
+ public void map(WritableComparable<?> key, VectorWritable vector, OutputCollector<IntWritable, WeightedVectorWritable> output,
Reporter reporter) throws IOException {
int clusterId = -1;
double clusterPdf = 0;
@@ -59,7 +60,7 @@ public class DirichletClusterMapper exte
}
}
System.out.println(clusterId + ": " + ClusterBase.formatVector(vector.get(), null));
- output.collect(new IntWritable(clusterId), vector);
+ output.collect(new IntWritable(clusterId), new WeightedVectorWritable(clusterPdf, vector));
}
@Override
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Thu Apr 29 16:16:10 2010
@@ -42,6 +42,7 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.dirichlet.models.VectorModelDistribution;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.CommandLineUtil;
@@ -376,7 +377,7 @@ public class DirichletDriver {
conf.setJobName("Dirichlet Clustering");
conf.setOutputKeyClass(IntWritable.class);
- conf.setOutputValueClass(VectorWritable.class);
+ conf.setOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java Thu Apr 29 16:16:10 2010
@@ -22,25 +22,24 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.VectorWritable;
public class FuzzyKMeansClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>, VectorWritable, IntWritable, VectorWritable> {
+ Mapper<WritableComparable<?>, VectorWritable, IntWritable, WeightedVectorWritable> {
private final List<SoftCluster> clusters = new ArrayList<SoftCluster>();
private FuzzyKMeansClusterer clusterer;
@Override
- public void map(WritableComparable<?> key, VectorWritable point, OutputCollector<IntWritable, VectorWritable> output,
+ public void map(WritableComparable<?> key, VectorWritable point, OutputCollector<IntWritable, WeightedVectorWritable> output,
Reporter reporter) throws IOException {
clusterer.outputPointWithClusterProbabilities(key.toString(), point.get(), clusters, output);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java Thu Apr 29 16:16:10 2010
@@ -25,9 +25,8 @@ import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
@@ -126,35 +125,32 @@ public class FuzzyKMeansClusterer {
* the OutputCollector to emit into
*/
public void outputPointWithClusterProbabilities(String key, Vector point, List<SoftCluster> clusters,
- OutputCollector<IntWritable, VectorWritable> output) throws IOException {
+ OutputCollector<IntWritable, WeightedVectorWritable> output) throws IOException {
- // TODO: remove this later
- // List<Double> clusterDistanceList = new ArrayList<Double>();
- //
- // for (SoftCluster cluster : clusters) {
- // clusterDistanceList.add(measure.distance(cluster.getCenter(), point));
- // }
- // FuzzyKMeansOutput fOutput = new FuzzyKMeansOutput(clusters.size());
- // for (int i = 0; i < clusters.size(); i++) {
- // double probWeight = computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
- // fOutput.add(i, clusters.get(i), probWeight);
- // }
- // String name = point.getName();
-
- // for now just emit the closest cluster
- int clusterId = -1;
- double distance = Double.MAX_VALUE;
+ // calculate point distances for all clusters
+ List<Double> clusterDistanceList = new ArrayList<Double>();
for (SoftCluster cluster : clusters) {
- Vector center = cluster.getCenter();
- // System.out.println("cluster-" + cluster.getId() + "@ " + ClusterBase.formatVector(center, null));
- double d = measure.distance(center, point);
- if (d < distance) {
- clusterId = cluster.getId();
- distance = d;
+ clusterDistanceList.add(measure.distance(cluster.getCenter(), point));
+ }
+ // calculate point pdf for all clusters
+ List<Double> clusterPdfList = new ArrayList<Double>();
+ for (int i = 0; i < clusters.size(); i++) {
+ double probWeight = computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
+ clusterPdfList.add(probWeight);
+ }
+ // for now just emit the most likely cluster
+ int clusterId = -1;
+ double clusterPdf = 0;
+ for (int i = 0; i < clusters.size(); i++) {
+ // System.out.println("cluster-" + clusters.get(i).getId() + "@ " + ClusterBase.formatVector(center, null));
+ double pdf = clusterPdfList.get(i);
+ if (pdf > clusterPdf) {
+ clusterId = clusters.get(i).getId();
+ clusterPdf = pdf;
}
}
// System.out.println("cluster-" + clusterId + ": " + ClusterBase.formatVector(point, null));
- output.collect(new IntWritable(clusterId), new VectorWritable(point));
+ output.collect(new IntWritable(clusterId), new WeightedVectorWritable(clusterPdf, new VectorWritable(point)));
}
/** Computes the probability of a point belonging to a cluster */
@@ -210,12 +206,8 @@ public class FuzzyKMeansClusterer {
* @return
* a List<List<SoftCluster>> of clusters produced per iteration
*/
- public static List<List<SoftCluster>> clusterPoints(List<Vector> points,
- List<SoftCluster> clusters,
- DistanceMeasure measure,
- double threshold,
- double m,
- int numIter) {
+ public static List<List<SoftCluster>> clusterPoints(List<Vector> points, List<SoftCluster> clusters, DistanceMeasure measure,
+ double threshold, double m, int numIter) {
List<List<SoftCluster>> clustersList = new ArrayList<List<SoftCluster>>();
clustersList.add(clusters);
FuzzyKMeansClusterer clusterer = new FuzzyKMeansClusterer(measure, threshold, m);
@@ -243,9 +235,7 @@ public class FuzzyKMeansClusterer {
* the List<Cluster> clusters
* @return
*/
- public static boolean runFuzzyKMeansIteration(List<Vector> points,
- List<SoftCluster> clusterList,
- FuzzyKMeansClusterer clusterer) {
+ public static boolean runFuzzyKMeansIteration(List<Vector> points, List<SoftCluster> clusterList, FuzzyKMeansClusterer clusterer) {
for (Vector point : points) {
List<Double> clusterDistanceList = new ArrayList<Double>();
for (SoftCluster cluster : clusterList) {
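The pdf emitted above is the standard fuzzy k-means membership weight, u_i = 1 / sum_j (d_i / d_j)^(2/(m-1)), where m is the fuzziness exponent. A standalone sketch of the two passes the new outputPointWithClusterProbabilities makes; the inline probWeight is assumed to match computeProbWeight, and zero distances would need clamping in practice.

import java.util.ArrayList;
import java.util.List;

public class FuzzyMembershipSketch {
  /** u_i = 1 / sum_j (d_i / d_j)^(2/(m-1)); assumed equivalent to computeProbWeight. */
  static double probWeight(double clusterDistance, List<Double> clusterDistanceList, double m) {
    double denom = 0.0;
    for (double dist : clusterDistanceList) {
      // zero distances would need clamping before dividing
      denom += Math.pow(clusterDistance / dist, 2.0 / (m - 1.0));
    }
    return 1.0 / denom;
  }

  /** Index of the most likely cluster, mirroring the emission loop above. */
  static int mostLikely(List<Double> distances, double m) {
    List<Double> pdfs = new ArrayList<Double>();
    for (double d : distances) {
      pdfs.add(probWeight(d, distances, m));
    }
    int best = -1;
    double bestPdf = 0;
    for (int i = 0; i < pdfs.size(); i++) {
      if (pdfs.get(i) > bestPdf) {
        best = i;
        bestPdf = pdfs.get(i);
      }
    }
    return best;
  }
}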
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Thu Apr 29 16:16:10 2010
@@ -17,7 +17,6 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -46,11 +45,11 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -348,7 +347,7 @@ public final class FuzzyKMeansDriver {
conf.setJobName("Fuzzy K Means Clustering");
conf.setOutputKeyClass(IntWritable.class);
- conf.setOutputValueClass(VectorWritable.class);
+ conf.setOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Thu Apr 29 16:16:10 2010
@@ -28,12 +28,12 @@ import org.apache.hadoop.mapred.MapReduc
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.VectorWritable;
public class KMeansClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>,VectorWritable,IntWritable,VectorWritable> {
+ Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
private final List<Cluster> clusters = new ArrayList<Cluster>();
private KMeansClusterer clusterer;
@@ -41,7 +41,7 @@ public class KMeansClusterMapper extends
@Override
public void map(WritableComparable<?> key,
VectorWritable point,
- OutputCollector<IntWritable,VectorWritable> output,
+ OutputCollector<IntWritable,WeightedVectorWritable> output,
Reporter reporter) throws IOException {
clusterer.outputPointWithClusterInfo(point.get(), clusters, output);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Thu Apr 29 16:16:10 2010
@@ -23,8 +23,8 @@ import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
@@ -35,12 +35,12 @@ import org.slf4j.LoggerFactory;
* representation. The class can be used as part of a clustering job to be started as map/reduce job.
* */
public class KMeansClusterer {
-
+
private static final Logger log = LoggerFactory.getLogger(KMeansClusterer.class);
-
+
/** Distance to use for point to cluster comparison. */
private final DistanceMeasure measure;
-
+
/**
* Init the k-means clusterer with the distance measure to use for comparison.
*
@@ -51,7 +51,7 @@ public class KMeansClusterer {
public KMeansClusterer(DistanceMeasure measure) {
this.measure = measure;
}
-
+
/**
* Iterates over all clusters and identifies the one closes to the given point. Distance measure used is
* configured at creation time of .
@@ -61,9 +61,8 @@ public class KMeansClusterer {
* @param clusters
* a List<Cluster> to test.
*/
- public void emitPointToNearestCluster(Vector point,
- List<Cluster> clusters,
- OutputCollector<Text,KMeansInfo> output) throws IOException {
+ public void emitPointToNearestCluster(Vector point, List<Cluster> clusters, OutputCollector<Text, KMeansInfo> output)
+ throws IOException {
Cluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
for (Cluster cluster : clusters) {
@@ -80,10 +79,9 @@ public class KMeansClusterer {
// emit only clusterID
output.collect(new Text(nearestCluster.getIdentifier()), new KMeansInfo(1, point));
}
-
- public void outputPointWithClusterInfo(Vector vector,
- List<Cluster> clusters,
- OutputCollector<IntWritable,VectorWritable> output) throws IOException {
+
+ public void outputPointWithClusterInfo(Vector vector, List<Cluster> clusters,
+ OutputCollector<IntWritable, WeightedVectorWritable> output) throws IOException {
Cluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
for (Cluster cluster : clusters) {
@@ -94,10 +92,10 @@ public class KMeansClusterer {
nearestDistance = distance;
}
}
-
- output.collect(new IntWritable(nearestCluster.getId()), new VectorWritable(vector));
+
+ output.collect(new IntWritable(nearestCluster.getId()), new WeightedVectorWritable(1, new VectorWritable(vector)));
}
-
+
/**
* This is the reference k-means implementation. Given its inputs it iterates over the points and clusters
* until their centers converge or until the maximum number of iterations is exceeded.
@@ -111,14 +109,11 @@ public class KMeansClusterer {
* @param maxIter
* the maximum number of iterations
*/
- public static List<List<Cluster>> clusterPoints(List<Vector> points,
- List<Cluster> clusters,
- DistanceMeasure measure,
- int maxIter,
- double distanceThreshold) {
+ public static List<List<Cluster>> clusterPoints(List<Vector> points, List<Cluster> clusters, DistanceMeasure measure,
+ int maxIter, double distanceThreshold) {
List<List<Cluster>> clustersList = new ArrayList<List<Cluster>>();
clustersList.add(clusters);
-
+
boolean converged = false;
int iteration = 0;
while (!converged && iteration < maxIter) {
@@ -133,7 +128,7 @@ public class KMeansClusterer {
}
return clustersList;
}
-
+
/**
* Perform a single iteration over the points and clusters, assigning points to clusters and returning if
* the iterations are completed.
@@ -146,10 +141,8 @@ public class KMeansClusterer {
* a DistanceMeasure to use
* @return
*/
- public static boolean runKMeansIteration(List<Vector> points,
- List<Cluster> clusters,
- DistanceMeasure measure,
- double distanceThreshold) {
+ public static boolean runKMeansIteration(List<Vector> points, List<Cluster> clusters, DistanceMeasure measure,
+ double distanceThreshold) {
// iterate through all points, assigning each to the nearest cluster
for (Vector point : points) {
Cluster closestCluster = null;
@@ -178,5 +171,5 @@ public class KMeansClusterer {
}
return converged;
}
-
+
}
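outputPointWithClusterInfo now wraps each point in a WeightedVectorWritable with a constant weight of 1, since hard k-means yields no membership probability. The selection itself is a plain nearest-center scan; a standalone sketch using the two-argument distance form seen in the fuzzy clusterer above (the import of the kmeans Cluster class is an assumption):

import java.util.List;
import org.apache.mahout.clustering.kmeans.Cluster; // assumed: the Cluster type used by the mapper above
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.Vector;

public class NearestClusterSketch {
  static Cluster nearest(Vector point, List<Cluster> clusters, DistanceMeasure measure) {
    Cluster nearestCluster = null;
    double nearestDistance = Double.MAX_VALUE;
    for (Cluster cluster : clusters) {
      double distance = measure.distance(cluster.getCenter(), point);
      if (nearestCluster == null || distance < nearestDistance) {
        nearestCluster = cluster;
        nearestDistance = distance;
      }
    }
    // the caller emits (nearestCluster.getId(), new WeightedVectorWritable(1, point))
    return nearestCluster;
  }
}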
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Thu Apr 29 16:16:10 2010
@@ -40,6 +40,7 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
@@ -303,11 +304,8 @@ public final class KMeansDriver {
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
- conf.setMapOutputKeyClass(IntWritable.class);
- conf.setMapOutputValueClass(VectorWritable.class);
conf.setOutputKeyClass(IntWritable.class);
- // the output is the cluster id
- conf.setOutputValueClass(VectorWritable.class);
+ conf.setOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyClusterMapper.java Thu Apr 29 16:16:10 2010
@@ -35,27 +35,27 @@ import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputLogFilter;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.VectorWritable;
public class MeanShiftCanopyClusterMapper extends MapReduceBase implements
- Mapper<WritableComparable<?>, MeanShiftCanopy, IntWritable, VectorWritable> {
+ Mapper<WritableComparable<?>, MeanShiftCanopy, IntWritable, WeightedVectorWritable> {
private MeanShiftCanopyClusterer clusterer;
- private OutputCollector<IntWritable, VectorWritable> output;
+ private OutputCollector<IntWritable, WeightedVectorWritable> output;
private List<MeanShiftCanopy> canopies;
@Override
- public void map(WritableComparable<?> key, MeanShiftCanopy vector, OutputCollector<IntWritable, VectorWritable> output,
+ public void map(WritableComparable<?> key, MeanShiftCanopy vector, OutputCollector<IntWritable, WeightedVectorWritable> output,
Reporter reporter) throws IOException {
int vectorId = vector.getId();
for (MeanShiftCanopy msc : canopies) {
for (int containedId : msc.getBoundPoints().toList()) {
if (vectorId == containedId) {
// System.out.println(msc.getId() + ": v" + vectorId + "=" + ClusterBase.formatVector(vector.getCenter(), null));
- output.collect(new IntWritable(msc.getId()), new VectorWritable(vector.getCenter()));
+ output.collect(new IntWritable(msc.getId()), new WeightedVectorWritable(1, new VectorWritable(vector.getCenter())));
}
}
}
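The mean-shift mapper emits the canopy center (weight 1) once for every canopy whose bound-point list contains the vector's id. A compact sketch of just that membership test, with a hypothetical helper around the MeanShiftCanopy accessors used above:

import java.util.ArrayList;
import java.util.List;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;

public class BoundPointSketch {
  /** Ids of the canopies that claim the given vector id as a bound point. */
  static List<Integer> owners(int vectorId, List<MeanShiftCanopy> canopies) {
    List<Integer> ids = new ArrayList<Integer>();
    for (MeanShiftCanopy msc : canopies) {
      for (int containedId : msc.getBoundPoints().toList()) {
        if (vectorId == containedId) {
          ids.add(msc.getId());
        }
      }
    }
    return ids;
  }
}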
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Thu Apr 29 16:16:10 2010
@@ -37,12 +37,10 @@ import org.apache.hadoop.mapred.JobClien
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansClusterMapper;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansConfigKeys;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -205,7 +203,7 @@ public final class MeanShiftCanopyDriver
conf.setJobName("Mean Shift Clustering");
conf.setOutputKeyClass(IntWritable.class);
- conf.setOutputValueClass(VectorWritable.class);
+ conf.setOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(output);
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Thu Apr 29 16:16:10 2010
@@ -31,9 +31,8 @@ import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.DummyReporter;
import org.apache.mahout.common.MahoutTestCase;
@@ -410,7 +409,7 @@ public class TestCanopyCreation extends
mapper.configure(conf);
List<Canopy> canopies = new ArrayList<Canopy>();
- DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
+ DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
int nextCanopyId = 0;
for (Vector centroid : manhattanCentroids) {
canopies.add(new Canopy(centroid, nextCanopyId++));
@@ -421,14 +420,14 @@ public class TestCanopyCreation extends
for (VectorWritable point : points) {
mapper.map(new Text(), point, collector, new DummyReporter());
}
- Map<IntWritable, List<VectorWritable>> data = collector.getData();
+ Map<IntWritable, List<WeightedVectorWritable>> data = collector.getData();
assertEquals("Number of map results", canopies.size(), data.size());
- for (Entry<IntWritable, List<VectorWritable>> stringListEntry : data.entrySet()) {
+ for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data.entrySet()) {
IntWritable key = stringListEntry.getKey();
Canopy canopy = findCanopy(key.get(), canopies);
- List<VectorWritable> pts = stringListEntry.getValue();
- for (VectorWritable ptDef : pts) {
- assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef.get()));
+ List<WeightedVectorWritable> pts = stringListEntry.getValue();
+ for (WeightedVectorWritable ptDef : pts) {
+ assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef.getVector().get()));
}
}
}
@@ -453,7 +452,7 @@ public class TestCanopyCreation extends
mapper.configure(conf);
List<Canopy> canopies = new ArrayList<Canopy>();
- DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
+ DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>();
int nextCanopyId = 0;
for (Vector centroid : euclideanCentroids) {
canopies.add(new Canopy(centroid, nextCanopyId++));
@@ -464,14 +463,14 @@ public class TestCanopyCreation extends
for (VectorWritable point : points) {
mapper.map(new Text(), point, collector, new DummyReporter());
}
- Map<IntWritable,List<VectorWritable>> data = collector.getData();
+ Map<IntWritable, List<WeightedVectorWritable>> data = collector.getData();
assertEquals("Number of map results", canopies.size(), data.size());
- for (Entry<IntWritable, List<VectorWritable>> stringListEntry : data.entrySet()) {
+ for (Entry<IntWritable, List<WeightedVectorWritable>> stringListEntry : data.entrySet()) {
IntWritable key = stringListEntry.getKey();
Canopy canopy = findCanopy(key.get(), canopies);
- List<VectorWritable> pts = stringListEntry.getValue();
- for (VectorWritable ptDef : pts) {
- assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef.get()));
+ List<WeightedVectorWritable> pts = stringListEntry.getValue();
+ for (WeightedVectorWritable ptDef : pts) {
+ assertTrue("Point not in canopy", mapper.canopyCovers(canopy, ptDef.getVector().get()));
}
}
}
@@ -500,10 +499,10 @@ public class TestCanopyCreation extends
* while (reader.ready()) { System.out.println(reader.readLine()); count++; }
*/
IntWritable clusterId = new IntWritable(0);
- VectorWritable vector = new VectorWritable();
+ WeightedVectorWritable vector = new WeightedVectorWritable();
while (reader.next(clusterId, vector)) {
count++;
- System.out.println("Txt: " + clusterId + " Vec: " + vector.get().asFormatString());
+ System.out.println("Txt: " + clusterId + " Vec: " + vector.getVector().get().asFormatString());
}
// the point [3.0,3.0] is covered by both canopies
assertEquals("number of points", 1 + points.size(), count);
@@ -532,7 +531,7 @@ public class TestCanopyCreation extends
* while (reader.ready()) { System.out.println(reader.readLine()); count++; }
*/
IntWritable canopyId = new IntWritable(0);
- VectorWritable can = new VectorWritable();
+ WeightedVectorWritable can = new WeightedVectorWritable();
while (reader.next(canopyId, can)) {
count++;
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Thu Apr 29 16:16:10 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.DummyReporter;
@@ -87,26 +88,37 @@ public class TestFuzzyKmeansClustering e
}
private static void computeCluster(List<Vector> points, List<SoftCluster> clusterList, FuzzyKMeansClusterer clusterer,
- Map<Integer, List<Vector>> pointClusterInfo) {
+ Map<Integer, List<WeightedVectorWritable>> pointClusterInfo) {
for (Vector point : points) {
+ // calculate point distances for all clusters
List<Double> clusterDistanceList = new ArrayList<Double>();
- SoftCluster closestCluster = null;
- double closestDistance = Double.MAX_VALUE;
for (SoftCluster cluster : clusterList) {
- double distance = clusterer.getMeasure().distance(point, cluster.getCenter());
- if (distance < closestDistance) {
- closestDistance = distance;
- closestCluster = cluster;
+ clusterDistanceList.add(clusterer.getMeasure().distance(cluster.getCenter(), point));
+ }
+ // calculate point pdf for all clusters
+ List<Double> clusterPdfList = new ArrayList<Double>();
+ for (int i = 0; i < clusterList.size(); i++) {
+ double probWeight = clusterer.computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
+ clusterPdfList.add(probWeight);
+ }
+ // for now just emit the most likely cluster
+ int clusterId = -1;
+ double clusterPdf = 0;
+ for (int i = 0; i < clusterList.size(); i++) {
+ // System.out.println("cluster-" + clusters.get(i).getId() + "@ " + ClusterBase.formatVector(center, null));
+ double pdf = clusterPdfList.get(i);
+ if (pdf > clusterPdf) {
+ clusterId = clusterList.get(i).getId();
+ clusterPdf = pdf;
}
- clusterDistanceList.add(distance);
}
- List<Vector> list = pointClusterInfo.get(closestCluster.getId());
+ List<WeightedVectorWritable> list = pointClusterInfo.get(clusterId);
if (list == null) {
- list = new ArrayList<Vector>();
- pointClusterInfo.put(closestCluster.getId(), list);
+ list = new ArrayList<WeightedVectorWritable>();
+ pointClusterInfo.put(clusterId, list);
}
- list.add(point);
+ list.add(new WeightedVectorWritable(clusterPdf, new VectorWritable(point)));
double totalProb = 0;
for (int i = 0; i < clusterList.size(); i++) {
SoftCluster cluster = clusterList.get(i);
@@ -118,9 +130,9 @@ public class TestFuzzyKmeansClustering e
for (SoftCluster cluster : clusterList) {
System.out.println(cluster.asFormatString(null));
- List<Vector> list = pointClusterInfo.get(cluster.getId());
+ List<WeightedVectorWritable> list = pointClusterInfo.get(cluster.getId());
if (list != null)
- for (Vector vector : list) {
+ for (WeightedVectorWritable vector : list) {
System.out.println("\t" + vector);
}
}
@@ -140,7 +152,7 @@ public class TestFuzzyKmeansClustering e
//cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
- Map<Integer, List<Vector>> pointClusterInfo = new HashMap<Integer, List<Vector>>();
+ Map<Integer, List<WeightedVectorWritable>> pointClusterInfo = new HashMap<Integer, List<WeightedVectorWritable>>();
// run reference FuzzyKmeans algorithm
List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points, clusterList, new EuclideanDistanceMeasure(),
0.001, 2, 2);
@@ -150,7 +162,7 @@ public class TestFuzzyKmeansClustering e
// iterate for each cluster
int size = 0;
for (int cId : pointClusterInfo.keySet()) {
- List<Vector> pts = pointClusterInfo.get(cId);
+ List<WeightedVectorWritable> pts = pointClusterInfo.get(cId);
size += pts.size();
}
assertEquals("total size", size, points.size());
@@ -222,7 +234,7 @@ public class TestFuzzyKmeansClustering e
// assertEquals("output dir files?", 4, outFiles.length);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
IntWritable key = new IntWritable();
- VectorWritable out = new VectorWritable();
+ WeightedVectorWritable out = new WeightedVectorWritable();
while (reader.next(key, out)) {
// make sure we can read all the clusters
}
@@ -493,7 +505,7 @@ public class TestFuzzyKmeansClustering e
softCluster.recomputeCenter();
}
- DummyOutputCollector<IntWritable, VectorWritable> clusterMapperCollector = new DummyOutputCollector<IntWritable, VectorWritable>();
+ DummyOutputCollector<IntWritable, WeightedVectorWritable> clusterMapperCollector = new DummyOutputCollector<IntWritable, WeightedVectorWritable>();
FuzzyKMeansClusterMapper clusterMapper = new FuzzyKMeansClusterMapper();
clusterMapper.config(reducerCluster);
@@ -511,7 +523,7 @@ public class TestFuzzyKmeansClustering e
Vector vec = tweakValue(points.get(i).get());
reference.add(new SoftCluster(vec, i));
}
- Map<Integer, List<Vector>> refClusters = new HashMap<Integer, List<Vector>>();
+ Map<Integer, List<WeightedVectorWritable>> refClusters = new HashMap<Integer, List<WeightedVectorWritable>>();
List<Vector> pointsVectors = new ArrayList<Vector>();
for (VectorWritable point : points) {
pointsVectors.add((Vector) point.get());
@@ -532,7 +544,7 @@ public class TestFuzzyKmeansClustering e
// make sure all points are allocated to a cluster
int size = 0;
for (int cId : refClusters.keySet()) {
- List<Vector> pts = refClusters.get(cId);
+ List<WeightedVectorWritable> pts = refClusters.get(cId);
size += pts.size();
}
assertEquals("total size", size, points.size());
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Thu Apr 29 16:16:10 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.clustering.ClusteringTestUtils;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.common.DummyOutputCollector;
import org.apache.mahout.common.DummyReporter;
@@ -370,15 +371,15 @@ public class TestKmeansClustering extend
// assertEquals("output dir files?", 4, outFiles.length);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
int[] expect = expectedNumPoints[k];
- DummyOutputCollector<IntWritable, VectorWritable> collector = new DummyOutputCollector<IntWritable, VectorWritable>();
+ DummyOutputCollector<IntWritable, WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable, WeightedVectorWritable>();
// The key is the clusterId
IntWritable clusterId = new IntWritable(0);
- // The value is the vector
- VectorWritable value = new VectorWritable();
+ // The value is the weighted vector
+ WeightedVectorWritable value = new WeightedVectorWritable();
while (reader.next(clusterId, value)) {
collector.collect(clusterId, value);
clusterId = new IntWritable(0);
- value = new VectorWritable();
+ value = new WeightedVectorWritable();
}
reader.close();
@@ -418,17 +419,17 @@ public class TestKmeansClustering extend
assertTrue("output dir exists?", outDir.exists());
String[] outFiles = outDir.list();
assertEquals("output dir files?", 4, outFiles.length);
- DummyOutputCollector<IntWritable, VectorWritable> collector = new DummyOutputCollector<IntWritable, VectorWritable>();
+ DummyOutputCollector<IntWritable, WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable, WeightedVectorWritable>();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
// The key is the clusterId
IntWritable clusterId = new IntWritable(0);
// The value is the vector
- VectorWritable value = new VectorWritable();
+ WeightedVectorWritable value = new WeightedVectorWritable();
while (reader.next(clusterId, value)) {
collector.collect(clusterId, value);
clusterId = new IntWritable(0);
- value = new VectorWritable();
+ value = new WeightedVectorWritable();
}
reader.close();
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java Thu Apr 29 16:16:10 2010
@@ -35,35 +35,32 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
class DisplayMeanShift extends DisplayDirichlet {
-
+
private static final Logger log = LoggerFactory.getLogger(DisplayMeanShift.class);
-
- private static final MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(
- new EuclideanDistanceMeasure(), 1.0, 0.05, 0.5);
+
private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
+
+ private static double t1, t2;
+
private DisplayMeanShift() {
initialize();
this.setTitle("Canopy Clusters (> 1.5% of population)");
}
-
- // TODO this is never queried?
- // private static final List<List<Vector>> iterationCenters = new ArrayList<List<Vector>>();
-
+
@Override
public void paint(Graphics g) {
Graphics2D g2 = (Graphics2D) g;
double sx = (double) res / ds;
g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-
+
// plot the axes
g2.setColor(Color.BLACK);
Vector dv = new DenseVector(2).assign(size / 2.0);
- Vector dv1 = new DenseVector(2).assign(DisplayMeanShift.clusterer.getT1());
- Vector dv2 = new DenseVector(2).assign(DisplayMeanShift.clusterer.getT2());
+ Vector dv1 = new DenseVector(2).assign(t1);
+ Vector dv2 = new DenseVector(2).assign(t2);
DisplayDirichlet.plotRectangle(g2, new DenseVector(2).assign(2), dv);
DisplayDirichlet.plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-
+
// plot the sample data
g2.setColor(Color.DARK_GRAY);
dv.assign(0.03);
@@ -72,7 +69,7 @@ class DisplayMeanShift extends DisplayDi
}
int i = 0;
for (MeanShiftCanopy canopy : canopies) {
- if (canopy.getBoundPoints().size() > 0.015 * DisplayDirichlet.sampleData.size()) {
+ if (canopy.getBoundPoints().toList().size() > 0.015 * DisplayDirichlet.sampleData.size()) {
g2.setColor(colors[Math.min(i++, DisplayDirichlet.colors.length - 1)]);
for (int v : canopy.getBoundPoints().elements()) {
DisplayDirichlet.plotRectangle(g2, sampleData.get(v).get(), dv);
@@ -82,7 +79,7 @@ class DisplayMeanShift extends DisplayDi
}
}
}
-
+
public static void main(String[] args) {
RandomUtils.useTestSeed();
DisplayDirichlet.generateSamples();
@@ -90,14 +87,15 @@ class DisplayMeanShift extends DisplayDi
for (VectorWritable sample : sampleData) {
points.add(sample.get());
}
- canopies = MeanShiftCanopyClusterer.clusterPoints(points, new EuclideanDistanceMeasure(), 0.5, 1.0, 0.05,
- 10);
+ t1 = 1.5;
+ t2 = 0.5;
+ canopies = MeanShiftCanopyClusterer.clusterPoints(points, new EuclideanDistanceMeasure(), 0.005, t1, t2, 20);
for (MeanShiftCanopy canopy : canopies) {
log.info(canopy.toString());
}
new DisplayMeanShift();
}
-
+
static void generateResults() {
DisplayDirichlet.generateResults(new NormalModelDistribution());
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java Thu Apr 29 16:16:10 2010
@@ -43,7 +43,7 @@ import org.apache.hadoop.mapred.Sequence
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusterBase;
-import org.apache.mahout.clustering.WeightedPointWritable;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.dirichlet.DirichletCluster;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.CommandLineUtil;
@@ -198,7 +198,7 @@ public class CDbwDriver {
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setMapOutputKeyClass(IntWritable.class);
- conf.setMapOutputValueClass(WeightedPointWritable.class);
+ conf.setMapOutputValueClass(WeightedVectorWritable.class);
FileInputFormat.setInputPaths(conf, new Path(input));
Path outPath = new Path(stateOut);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwMapper.java Thu Apr 29 16:16:10 2010
@@ -35,37 +35,38 @@ import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputLogFilter;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.mahout.clustering.WeightedPointWritable;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.VectorWritable;
-public class CDbwMapper extends MapReduceBase implements Mapper<IntWritable, VectorWritable, IntWritable, WeightedPointWritable> {
+public class CDbwMapper extends MapReduceBase implements
+ Mapper<IntWritable, WeightedVectorWritable, IntWritable, WeightedVectorWritable> {
private Map<Integer, List<VectorWritable>> representativePoints;
- private Map<Integer, WeightedPointWritable> mostDistantPoints = new HashMap<Integer, WeightedPointWritable>();
+ private Map<Integer, WeightedVectorWritable> mostDistantPoints = new HashMap<Integer, WeightedVectorWritable>();
private DistanceMeasure measure = new EuclideanDistanceMeasure();
- private OutputCollector<IntWritable, WeightedPointWritable> output = null;
+ private OutputCollector<IntWritable, WeightedVectorWritable> output = null;
@Override
- public void map(IntWritable clusterId, VectorWritable point, OutputCollector<IntWritable, WeightedPointWritable> output,
+ public void map(IntWritable clusterId, WeightedVectorWritable point, OutputCollector<IntWritable, WeightedVectorWritable> output,
Reporter reporter) throws IOException {
this.output = output;
int key = clusterId.get();
- WeightedPointWritable currentMDP = mostDistantPoints.get(key);
+ WeightedVectorWritable currentMDP = mostDistantPoints.get(key);
List<VectorWritable> refPoints = representativePoints.get(key);
double totalDistance = 0.0;
for (VectorWritable refPoint : refPoints) {
- totalDistance += measure.distance(refPoint.get(), point.get());
+ totalDistance += measure.distance(refPoint.get(), point.getVector().get());
}
if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
- mostDistantPoints.put(key, new WeightedPointWritable(totalDistance, new VectorWritable(point.get().clone())));
+ mostDistantPoints.put(key, new WeightedVectorWritable(totalDistance, new VectorWritable(point.getVector().get().clone())));
}
}
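In CDbw the weight field is repurposed to carry a point's total distance to its cluster's representative points, and the mapper keeps only the most distant point per cluster. The update step, restated as a standalone sketch (the holder class and method are hypothetical):

import java.util.List;
import java.util.Map;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.VectorWritable;

public class MostDistantPointSketch {
  static void update(int clusterId, WeightedVectorWritable point, List<VectorWritable> refPoints,
      Map<Integer, WeightedVectorWritable> mostDistantPoints, DistanceMeasure measure) {
    // total distance from this point to all of the cluster's representative points
    double totalDistance = 0.0;
    for (VectorWritable refPoint : refPoints) {
      totalDistance += measure.distance(refPoint.get(), point.getVector().get());
    }
    // keep the point only if it beats the current most distant point for this cluster
    WeightedVectorWritable currentMDP = mostDistantPoints.get(clusterId);
    if (currentMDP == null || currentMDP.getWeight() < totalDistance) {
      mostDistantPoints.put(clusterId,
          new WeightedVectorWritable(totalDistance, new VectorWritable(point.getVector().get().clone())));
    }
  }
}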
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java?rev=939360&r1=939359&r2=939360&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwReducer.java Thu Apr 29 16:16:10 2010
@@ -29,28 +29,28 @@ import org.apache.hadoop.mapred.MapReduc
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.mahout.clustering.WeightedPointWritable;
+import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.VectorWritable;
-public class CDbwReducer extends MapReduceBase implements Reducer<IntWritable, WeightedPointWritable, IntWritable, VectorWritable> {
+public class CDbwReducer extends MapReduceBase implements Reducer<IntWritable, WeightedVectorWritable, IntWritable, VectorWritable> {
private Map<Integer, List<VectorWritable>> referencePoints;
private OutputCollector<IntWritable, VectorWritable> output;
@Override
- public void reduce(IntWritable key, Iterator<WeightedPointWritable> values, OutputCollector<IntWritable, VectorWritable> output,
+ public void reduce(IntWritable key, Iterator<WeightedVectorWritable> values, OutputCollector<IntWritable, VectorWritable> output,
Reporter reporter) throws IOException {
this.output = output;
// find the most distant point
- WeightedPointWritable mdp = null;
+ WeightedVectorWritable mdp = null;
while (values.hasNext()) {
- WeightedPointWritable dpw = values.next();
+ WeightedVectorWritable dpw = values.next();
if (mdp == null || mdp.getWeight() < dpw.getWeight()) {
- mdp = new WeightedPointWritable(dpw.getWeight(), dpw.getPoint());
+ mdp = new WeightedVectorWritable(dpw.getWeight(), dpw.getVector());
}
}
- output.collect(new IntWritable(key.get()), mdp.getPoint());
+ output.collect(new IntWritable(key.get()), mdp.getVector());
}
public void configure(Map<Integer, List<VectorWritable>> referencePoints) {