You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by is...@apache.org on 2009/12/10 10:34:54 UTC
svn commit: r889158 [1/2] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/main/java/org/apache/mah...
Author: isabel
Date: Thu Dec 10 09:34:48 2009
New Revision: 889158
URL: http://svn.apache.org/viewvc?rev=889158&view=rev
Log:
MAHOUT-11 - refactors lots of the clustering code to get rid of its
static fields. Thanks to Drew Farris for fixing most of the code.
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/VisibleCanopy.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/canopy/DisplayCanopy.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java Thu Dec 10 09:34:48 2009
@@ -43,7 +43,7 @@
return id;
}
- protected void setId(int id) {
+ public void setId(int id) {
this.id = id;
}
@@ -51,7 +51,7 @@
return center;
}
- protected void setCenter(Vector center) {
+ public void setCenter(Vector center) {
this.center = center;
}
@@ -59,7 +59,7 @@
return numPoints;
}
- protected void setNumPoints(int numPoints) {
+ public void setNumPoints(int numPoints) {
this.numPoints = numPoints;
}
@@ -67,7 +67,7 @@
return pointTotal;
}
- protected void setPointTotal(Vector pointTotal) {
+ public void setPointTotal(Vector pointTotal) {
this.pointTotal = pointTotal;
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java Thu Dec 10 09:34:48 2009
@@ -18,17 +18,14 @@
package org.apache.mahout.clustering.canopy;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.Vector;
-import org.apache.mahout.common.distance.DistanceMeasure;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.util.List;
/**
* This class models a canopy as a center point, the number of points that are contained within it according to the
@@ -37,45 +34,11 @@
*/
public class Canopy extends ClusterBase {
- // keys used by Driver, Mapper, Combiner & Reducer
- public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
-
- public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
-
- public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
-
- public static final String CANOPY_PATH_KEY = "org.apache.mahout.clustering.canopy.path";
-
- // the next canopyId to be allocated
- private static int nextCanopyId = 0;
-
- // the T1 distance threshold
- private static double t1;
-
- // the T2 distance threshold
- private static double t2;
-
- // the distance measure
- private static DistanceMeasure measure;
-
-
- /** Used w */
+ /** Used for deserialization as a Writable */
public Canopy() {
}
/**
- * Create a new Canopy containing the given point
- *
- * @param point a point in vector space
- */
- public Canopy(Vector point) {
- this.setId(nextCanopyId++);
- this.setCenter(point.clone());
- this.setPointTotal(point.clone());
- this.setNumPoints(1);
- }
-
- /**
* Create a new Canopy containing the given point and canopyId
*
* @param point a point in vector space
@@ -88,118 +51,6 @@
this.setNumPoints(1);
}
- /**
- * Configure the Canopy and its distance measure
- *
- * @param job the JobConf for this job
- */
- public static void configure(JobConf job) {
- try {
- ClassLoader ccl = Thread.currentThread().getContextClassLoader();
- Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
- measure = (DistanceMeasure) cl.newInstance();
- measure.configure(job);
- } catch (ClassNotFoundException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- }
- nextCanopyId = 0;
- t1 = Double.parseDouble(job.get(T1_KEY));
- t2 = Double.parseDouble(job.get(T2_KEY));
- }
-
- /** Configure the Canopy for unit tests */
- public static void config(DistanceMeasure aMeasure, double aT1, double aT2) {
- nextCanopyId = 0;
- measure = aMeasure;
- t1 = aT1;
- t2 = aT2;
- }
-
- /**
- * This is the same algorithm as the reference but inverted to iterate over existing canopies instead of the points.
- * Because of this it does not need to actually store the points, instead storing a total points vector and the number
- * of points. From this a centroid can be computed. <p/> This method is used by the CanopyReducer.
- *
- * @param point the point to be added
- * @param canopies the List<Canopy> to be appended
- */
- public static void addPointToCanopies(Vector point, List<Canopy> canopies) {
- boolean pointStronglyBound = false;
- for (Canopy canopy : canopies) {
- double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
- if (dist < t1) {
- canopy.addPoint(point);
- }
- pointStronglyBound = pointStronglyBound || (dist < t2);
- }
- if (!pointStronglyBound) {
- canopies.add(new Canopy(point));
- }
- }
-
- /**
- * This method is used by the CanopyMapper to perform canopy inclusion tests and to emit the point and its covering
- * canopies to the output. The CanopyCombiner will then sum the canopy points and produce the centroids.
- *
- * @param point the point to be added
- * @param canopies the List<Canopy> to be appended
- * @param collector an OutputCollector in which to emit the point
- */
- public static void emitPointToNewCanopies(Vector point,
- List<Canopy> canopies, OutputCollector<Text, Vector> collector)
- throws IOException {
- boolean pointStronglyBound = false;
- for (Canopy canopy : canopies) {
- double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
- if (dist < t1) {
- canopy.emitPoint(point, collector);
- }
- pointStronglyBound = pointStronglyBound || (dist < t2);
- }
- if (!pointStronglyBound) {
- Canopy canopy = new Canopy(point);
- canopies.add(canopy);
- canopy.emitPoint(point, collector);
- }
- }
-
- /**
- * This method is used by the CanopyMapper to perform canopy inclusion tests and to emit the point keyed by its
- * covering canopies to the output. if the point is not covered by any canopies (due to canopy centroid clustering),
- * emit the point to the closest covering canopy.
- *
- * @param point the point to be added
- * @param canopies the List<Canopy> to be appended
- * @param collector an OutputCollector in which to emit the point
- */
- public static void emitPointToExistingCanopies(Vector point,
- List<Canopy> canopies,
- OutputCollector<Text, Vector> collector) throws IOException {
- double minDist = Double.MAX_VALUE;
- Canopy closest = null;
- boolean isCovered = false;
- for (Canopy canopy : canopies) {
- double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
- if (dist < t1) {
- isCovered = true;
- collector.collect(new Text(canopy.getIdentifier()), point);
- } else if (dist < minDist) {
- minDist = dist;
- closest = canopy;
- }
- }
- // if the point is not contained in any canopies (due to canopy centroid
- // clustering), emit the point to the closest covering canopy.
- if (!isCovered) {
- collector.collect(new Text(closest.getIdentifier()), point);
- }
- }
-
-
@Override
public void write(DataOutput out) throws IOException {
super.write(out);
@@ -283,14 +134,4 @@
public Vector computeCentroid() {
return getPointTotal().divide(getNumPoints());
}
-
- /**
- * Return if the point is covered by this canopy
- *
- * @param point a point
- * @return if the point is covered
- */
- public boolean covers(Vector point) {
- return measure.distance(getCenter().getLengthSquared(), getCenter(), point) < t1;
- }
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Thu Dec 10 09:34:48 2009
@@ -134,11 +134,10 @@
log.info("Input: " + input + " Out: " + output + " Measure: " + measureClassName + " t1: " + t1
+ " t2: " + t2 + " Vector Class: " + vectorClass.getSimpleName());
JobClient client = new JobClient();
- JobConf conf = new JobConf(
- org.apache.mahout.clustering.canopy.CanopyDriver.class);
- conf.set(Canopy.DISTANCE_MEASURE_KEY, measureClassName);
- conf.set(Canopy.T1_KEY, String.valueOf(t1));
- conf.set(Canopy.T2_KEY, String.valueOf(t2));
+ JobConf conf = new JobConf(CanopyDriver.class);
+ conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+ conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
+ conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
conf.setInputFormat(SequenceFileInputFormat.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java Thu Dec 10 09:34:48 2009
@@ -37,17 +37,19 @@
private OutputCollector<Text, Vector> outputCollector;
+ private CanopyClusterer canopyClusterer;
+
@Override
public void map(WritableComparable<?> key, Vector point,
OutputCollector<Text, Vector> output, Reporter reporter) throws IOException {
outputCollector = output;
- Canopy.addPointToCanopies(point, canopies);
+ canopyClusterer.addPointToCanopies(point, canopies);
}
@Override
public void configure(JobConf job) {
super.configure(job);
- Canopy.configure(job);
+ canopyClusterer = new CanopyClusterer(job);
}
@Override
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java Thu Dec 10 09:34:48 2009
@@ -35,12 +35,14 @@
private final List<Canopy> canopies = new ArrayList<Canopy>();
+ private CanopyClusterer canopyClusterer;
+
@Override
public void reduce(Text key, Iterator<Vector> values,
OutputCollector<Text, Canopy> output, Reporter reporter) throws IOException {
while (values.hasNext()) {
Vector point = values.next();
- Canopy.addPointToCanopies(point, canopies);
+ canopyClusterer.addPointToCanopies(point, canopies);
}
for (Canopy canopy : canopies) {
output.collect(new Text(canopy.getIdentifier()), canopy);
@@ -50,7 +52,7 @@
@Override
public void configure(JobConf job) {
super.configure(job);
- Canopy.configure(job);
+ canopyClusterer = new CanopyClusterer(job);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java Thu Dec 10 09:34:48 2009
@@ -140,13 +140,12 @@
public static void runJob(String points, String canopies, String output,
String measureClassName, double t1, double t2, Class<? extends Vector> vectorClass) throws IOException {
JobClient client = new JobClient();
- JobConf conf = new JobConf(
- org.apache.mahout.clustering.canopy.ClusterDriver.class);
+ JobConf conf = new JobConf(ClusterDriver.class);
- conf.set(Canopy.DISTANCE_MEASURE_KEY, measureClassName);
- conf.set(Canopy.T1_KEY, String.valueOf(t1));
- conf.set(Canopy.T2_KEY, String.valueOf(t2));
- conf.set(Canopy.CANOPY_PATH_KEY, canopies);
+ conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+ conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
+ conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
+ conf.set(CanopyConfigKeys.CANOPY_PATH_KEY, canopies);
conf.setInputFormat(SequenceFileInputFormat.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java Thu Dec 10 09:34:48 2009
@@ -36,12 +36,13 @@
public class ClusterMapper extends MapReduceBase implements
Mapper<WritableComparable<?>, Vector, Text, Vector> {
- private List<Canopy> canopies;
+ private CanopyClusterer canopyClusterer;
+ private final List<Canopy> canopies = new ArrayList<Canopy>();
@Override
public void map(WritableComparable<?> key, Vector point,
OutputCollector<Text, Vector> output, Reporter reporter) throws IOException {
- Canopy.emitPointToExistingCanopies(point, canopies, output);
+ canopyClusterer.emitPointToExistingCanopies(point, canopies, output);
}
/**
@@ -50,33 +51,42 @@
* @param canopies a List<Canopy>
*/
public void config(List<Canopy> canopies) {
- this.canopies = canopies;
+ this.canopies.clear();
+ this.canopies.addAll(canopies);
}
@Override
public void configure(JobConf job) {
super.configure(job);
- Canopy.configure(job);
+ canopyClusterer = new CanopyClusterer(job);
- String canopyPath = job.get(Canopy.CANOPY_PATH_KEY);
- canopies = new ArrayList<Canopy>();
-
- try {
- Path path = new Path(canopyPath + "/part-00000");
- FileSystem fs = FileSystem.get(path.toUri(), job);
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+ String canopyPath = job.get(CanopyConfigKeys.CANOPY_PATH_KEY);
+ if (canopyPath != null && canopyPath.length() > 0) {
try {
- Text key = new Text();
- Canopy value = new Canopy();
- while (reader.next(key, value)) {
- canopies.add(value);
- value = new Canopy();
+ Path path = new Path(canopyPath + "/part-00000");
+ FileSystem fs = FileSystem.get(path.toUri(), job);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+ try {
+ Text key = new Text();
+ Canopy value = new Canopy();
+ while (reader.next(key, value)) {
+ canopies.add(value);
+ value = new Canopy();
+ }
+ } finally {
+ reader.close();
}
- } finally {
- reader.close();
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+
+ if (canopies.isEmpty()) {
+ throw new NullPointerException("Canopies are empty!");
}
- } catch (IOException e) {
- throw new IllegalStateException(e);
}
}
+
+ public boolean canopyCovers(Canopy canopy, Vector point) {
+ return canopyClusterer.canopyCovers(canopy, point);
+ }
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java Thu Dec 10 09:34:48 2009
@@ -33,12 +33,13 @@
public class FuzzyKMeansClusterMapper extends MapReduceBase implements
Mapper<WritableComparable<?>, Vector, Text, FuzzyKMeansOutput> {
- private List<SoftCluster> clusters;
+ private final List<SoftCluster> clusters = new ArrayList<SoftCluster>();
+ private FuzzyKMeansClusterer clusterer;
@Override
public void map(WritableComparable<?> key, Vector point,
OutputCollector<Text, FuzzyKMeansOutput> output, Reporter reporter) throws IOException {
- SoftCluster.outputPointWithClusterProbabilities(key.toString(), point, clusters, output);
+ clusterer.outputPointWithClusterProbabilities(key.toString(), point, clusters, output);
}
/**
@@ -47,19 +48,21 @@
* @param clusters a List<Cluster>
*/
void config(List<SoftCluster> clusters) {
- this.clusters = clusters;
+ this.clusters.clear();
+ this.clusters.addAll(clusters);
}
@Override
public void configure(JobConf job) {
super.configure(job);
- SoftCluster.configure(job);
- clusters = new ArrayList<SoftCluster>();
-
- FuzzyKMeansUtil.configureWithClusterInfo(job
- .get(SoftCluster.CLUSTER_PATH_KEY), clusters);
+ clusterer = new FuzzyKMeansClusterer(job);
+ String clusterPath = job.get(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY);
+ if (clusterPath != null && clusterPath.length() > 0) {
+ FuzzyKMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+ }
+
if (clusters.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java Thu Dec 10 09:34:48 2009
@@ -30,6 +30,8 @@
public class FuzzyKMeansCombiner extends MapReduceBase implements
Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo> {
+ private FuzzyKMeansClusterer clusterer;
+
@Override
public void reduce(Text key, Iterator<FuzzyKMeansInfo> values,
OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
@@ -40,7 +42,7 @@
if (info.getCombinerPass() == 0) // first time thru combiner
{
- cluster.addPoint(info.getVector(), Math.pow(info.getProbability(), SoftCluster.getM()));
+ cluster.addPoint(info.getVector(), Math.pow(info.getProbability(), clusterer.getM()));
} else {
cluster.addPoints(info.getVector(), info.getProbability());
}
@@ -53,7 +55,7 @@
@Override
public void configure(JobConf job) {
super.configure(job);
- SoftCluster.configure(job);
+ clusterer = new FuzzyKMeansClusterer(job);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Thu Dec 10 09:34:48 2009
@@ -274,10 +274,10 @@
conf.setNumMapTasks(numMapTasks);
conf.setNumReduceTasks(numReduceTasks);
- conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
- conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
- conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
- conf.set(SoftCluster.M_KEY, String.valueOf(m));
+ conf.set(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+ conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+ conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+ conf.set(FuzzyKMeansConfigKeys.M_KEY, String.valueOf(m));
// uncomment it to run locally
// conf.set("mapred.job.tracker", "local");
@@ -327,10 +327,10 @@
// conf.set("mapred.job.tracker", "local");
conf.setNumMapTasks(numMapTasks);
conf.setNumReduceTasks(0);
- conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
- conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
- conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
- conf.set(SoftCluster.M_KEY, String.valueOf(m));
+ conf.set(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+ conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+ conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+ conf.set(FuzzyKMeansConfigKeys.M_KEY, String.valueOf(m));
try {
JobClient.runJob(conf);
} catch (IOException e) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java Thu Dec 10 09:34:48 2009
@@ -37,12 +37,13 @@
private static final Logger log = LoggerFactory.getLogger(FuzzyKMeansMapper.class);
- private List<SoftCluster> clusters;
-
+ private List<SoftCluster> clusters = new ArrayList<SoftCluster>();
+ private FuzzyKMeansClusterer clusterer;
+
@Override
public void map(WritableComparable<?> key, Vector point,
OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
- SoftCluster.emitPointProbToCluster(point, clusters, output);
+ clusterer.emitPointProbToCluster(point, clusters, output);
}
/**
@@ -51,21 +52,23 @@
* @param clusters a List<Cluster>
*/
void config(List<SoftCluster> clusters) {
- this.clusters = clusters;
+ this.clusters.clear();
+ this.clusters.addAll(clusters);
}
@Override
public void configure(JobConf job) {
super.configure(job);
- SoftCluster.configure(job);
+ clusterer = new FuzzyKMeansClusterer(job);
log.info("In Mapper Configure:");
- clusters = new ArrayList<SoftCluster>();
-
- FuzzyKMeansUtil.configureWithClusterInfo(job
- .get(SoftCluster.CLUSTER_PATH_KEY), clusters);
+ String clusterPath = job.get(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY);
+ if (clusterPath != null && clusterPath.length() > 0) {
+ FuzzyKMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+ }
+
if (clusters.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java Thu Dec 10 09:34:48 2009
@@ -34,8 +34,9 @@
public class FuzzyKMeansReducer extends MapReduceBase implements
Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster> {
- private Map<String, SoftCluster> clusterMap;
-
+ private final Map<String, SoftCluster> clusterMap = new HashMap<String, SoftCluster>();
+ private FuzzyKMeansClusterer clusterer;
+
@Override
public void reduce(Text key, Iterator<FuzzyKMeansInfo> values,
OutputCollector<Text, SoftCluster> output, Reporter reporter) throws IOException {
@@ -47,14 +48,14 @@
if (value.getCombinerPass() == 0) // escaped from combiner
{
- cluster.addPoint(value.getVector(), Math.pow(value.getProbability(), SoftCluster.getM()));
+ cluster.addPoint(value.getVector(), Math.pow(value.getProbability(), clusterer.getM()));
} else {
cluster.addPoints(value.getVector(), value.getProbability());
}
}
// force convergence calculation
- cluster.computeConvergence();
+ clusterer.computeConvergence(cluster);
output.collect(new Text(cluster.getIdentifier()), cluster);
}
@@ -62,21 +63,22 @@
public void configure(JobConf job) {
super.configure(job);
- SoftCluster.configure(job);
- clusterMap = new HashMap<String, SoftCluster>();
+ clusterer = new FuzzyKMeansClusterer(job);
List<SoftCluster> clusters = new ArrayList<SoftCluster>();
- FuzzyKMeansUtil.configureWithClusterInfo(job
- .get(SoftCluster.CLUSTER_PATH_KEY), clusters);
- setClusterMap(clusters);
-
+ String clusterPath = job.get(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY);
+ if (clusterPath != null && clusterPath.length() > 0) {
+ FuzzyKMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+ setClusterMap(clusters);
+ }
+
if (clusterMap.isEmpty()) {
throw new NullPointerException("Cluster is empty!!!");
}
}
private void setClusterMap(List<SoftCluster> clusters) {
- clusterMap = new HashMap<String, SoftCluster>();
+ clusterMap.clear();
for (SoftCluster cluster : clusters) {
clusterMap.put(cluster.getIdentifier(), cluster);
}
@@ -85,7 +87,6 @@
public void config(List<SoftCluster> clusters) {
setClusterMap(clusters);
-
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java Thu Dec 10 09:34:48 2009
@@ -17,46 +17,18 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.SparseVector;
import org.apache.mahout.matrix.SquareRootFunction;
import org.apache.mahout.matrix.Vector;
-import org.apache.mahout.common.distance.DistanceMeasure;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
public class SoftCluster implements Writable {
- public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
-
- public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
-
- public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
-
- public static final String M_KEY = "org.apache.mahout.clustering.fuzzykmeans.m";
-
- private static double m = 2.0; // default value
-
- private static final double MINIMAL_VALUE = 0.0000000001; // using it for
-
- // adding
-
- // exception
- // this value to any
- // zero valued
- // variable to avoid
- // divide by Zero
-
- private static int nextClusterId = 0;
-
// this cluster's clusterId
private int clusterId;
@@ -81,11 +53,7 @@
private Vector s1;
private Vector s2;
-
- private static DistanceMeasure measure;
-
- private static double convergenceDelta = 0;
-
+
/**
* Format the SoftCluster for output
*
@@ -135,126 +103,12 @@
this.weightedPointTotal = center.like();
}
-
- /**
- * Configure the distance measure from the job
- *
- * @param job the JobConf for the job
- */
- public static void configure(JobConf job) {
- try {
- ClassLoader ccl = Thread.currentThread().getContextClassLoader();
- Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
- measure = (DistanceMeasure) cl.newInstance();
- measure.configure(job);
- convergenceDelta = Double.parseDouble(job.get(CLUSTER_CONVERGENCE_KEY));
- nextClusterId = 0;
- m = Double.parseDouble(job.get(M_KEY));
- } catch (ClassNotFoundException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- }
- }
-
- /**
- * Configure the distance measure directly. Used by unit tests.
- *
- * @param aMeasure the DistanceMeasure
- * @param aConvergenceDelta the delta value used to define convergence
- */
- public static void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
- measure = aMeasure;
- convergenceDelta = aConvergenceDelta;
- nextClusterId = 0;
- }
-
- /**
- * Emit the point and its probability of belongingness to each cluster
- *
- * @param point a point
- * @param clusters a List<SoftCluster>
- * @param output the OutputCollector to emit into
- */
- public static void emitPointProbToCluster(Vector point,
- List<SoftCluster> clusters,
- OutputCollector<Text, FuzzyKMeansInfo> output) throws IOException {
- List<Double> clusterDistanceList = new ArrayList<Double>();
- for (SoftCluster cluster : clusters) {
- clusterDistanceList.add(measure.distance(cluster.getCenter(), point));
- }
-
- for (int i = 0; i < clusters.size(); i++) {
- double probWeight = computeProbWeight(clusterDistanceList.get(i),
- clusterDistanceList);
- Text key = new Text(clusters.get(i).getIdentifier()); // just output the
- // identifier,avoids
- // too much data
- // traffic
- /*Text value = new Text(Double.toString(probWeight)
- + FuzzyKMeansDriver.MAPPER_VALUE_SEPARATOR + values.toString());*/
- FuzzyKMeansInfo value = new FuzzyKMeansInfo(probWeight, point);
- output.collect(key, value);
- }
- }
-
- /**
- * Output point with cluster info (Cluster and probability)
- *
- * @param point a point
- * @param clusters a List<SoftCluster> to test
- * @param output the OutputCollector to emit into
- */
- public static void outputPointWithClusterProbabilities(String key,
- Vector point, List<SoftCluster> clusters,
- OutputCollector<Text, FuzzyKMeansOutput> output) throws IOException {
- List<Double> clusterDistanceList = new ArrayList<Double>();
-
- for (SoftCluster cluster : clusters) {
- clusterDistanceList.add(measure.distance(point, cluster.getCenter()));
- }
- FuzzyKMeansOutput fOutput = new FuzzyKMeansOutput(clusters.size());
- for (int i = 0; i < clusters.size(); i++) {
- // System.out.print("cluster:" + i + "\t" + clusterDistanceList.get(i));
-
- double probWeight = computeProbWeight(clusterDistanceList.get(i),
- clusterDistanceList);
- /*outputValue.append(clusters.get(i).clusterId).append(':').append(
- probWeight).append(' ');*/
- fOutput.add(i, clusters.get(i), probWeight);
- }
- String name = point.getName();
- output.collect(new Text(name != null && name.length() != 0 ? name
- : point.asFormatString()),
- fOutput);
- }
-
- /** Computes the probability of a point belonging to a cluster */
- public static double computeProbWeight(double clusterDistance,
- List<Double> clusterDistanceList) {
- if (clusterDistance == 0) {
- clusterDistance = MINIMAL_VALUE;
- }
- double denom = 0.0;
- for (double eachCDist : clusterDistanceList) {
- if (eachCDist == 0.0) {
- eachCDist = MINIMAL_VALUE;
- }
-
- denom += Math.pow(clusterDistance / eachCDist, 2.0 / (m - 1));
-
- }
- return 1.0 / denom;
- }
-
/**
* Compute the centroid
*
* @return the new centroid
*/
- private Vector computeCentroid() {
+ public Vector computeCentroid() {
if (pointProbSum == 0) {
return weightedPointTotal;
} else if (centroid == null) {
@@ -274,7 +128,6 @@
* @param center the center point
*/
public SoftCluster(Vector center) {
- this.clusterId = nextClusterId++;
this.center = center;
this.pointProbSum = 0;
@@ -389,17 +242,6 @@
weightedPointTotal = center.like();
}
- /**
- * Return if the cluster is converged by comparing its center and centroid.
- *
- * @return if the cluster is converged
- */
- public boolean computeConvergence() {
- Vector centroid = computeCentroid();
- converged = measure.distance(center, centroid) <= convergenceDelta;
- return converged;
- }
-
public Vector getWeightedPointTotal() {
return weightedPointTotal;
}
@@ -412,7 +254,7 @@
return converged;
}
- private void setConverged(boolean converged) {
+ public void setConverged(boolean converged) {
this.converged = converged;
}
@@ -420,8 +262,6 @@
return clusterId;
}
- public static double getM() {
- return m;
- }
+
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Thu Dec 10 09:34:48 2009
@@ -16,9 +16,6 @@
*/
package org.apache.mahout.clustering.kmeans;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.SquareRootFunction;
@@ -28,42 +25,26 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.util.List;
public class Cluster extends ClusterBase {
- private static final String ERROR_UNKNOWN_CLUSTER_FORMAT="Unknown cluster format:\n";
-
- public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
+ /** Error message for unknown cluster format in output. */
+ private static final String ERROR_UNKNOWN_CLUSTER_FORMAT = "Unknown cluster format:\n";
- public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
-
- public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
-
- /** The number of iterations that have taken place */
- public static final String ITERATION_NUMBER = "org.apache.mahout.clustering.kmeans.iteration";
- /** Boolean value indicating whether the initial input is from Canopy clustering */
- public static final String CANOPY_INPUT = "org.apache.mahout.clustering.kmeans.canopyInput";
-
- private static int nextClusterId = 0;
-
-
- // the current centroid is lazy evaluated and may be null
+ /** The current centroid is lazy evaluated and may be null */
private Vector centroid = null;
-
- // the total of all the points squared, used for std computation
+ /** The total of all the points squared, used for std computation */
private Vector pointSquaredTotal = null;
- // has the centroid converged with the center?
+ /** Has the centroid converged with the center? */
private boolean converged = false;
- private static DistanceMeasure measure;
- private static double convergenceDelta = 0;
/**
* Format the cluster for output
- *
- * @param cluster the Cluster
+ *
+ * @param cluster
+ * the Cluster
* @return the String representation of the Cluster
*/
public static String formatCluster(Cluster cluster) {
@@ -77,16 +58,19 @@
}
/**
- * Decodes and returns a Cluster from the formattedString
- *
- * @param formattedString a String produced by formatCluster
+ * Decodes and returns a Cluster from the formattedString.
+ *
+ * @param formattedString
+ * a String produced by formatCluster
* @return a decoded Cluster, not null
- * @throws IllegalArgumentException when the string is wrongly formatted
+ * @throws IllegalArgumentException
+ * when the string is wrongly formatted
*/
public static Cluster decodeCluster(String formattedString) {
final int beginIndex = formattedString.indexOf('{');
if (beginIndex <= 0) {
- throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT + formattedString);
+ throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT
+ + formattedString);
}
final String id = formattedString.substring(0, beginIndex);
final String center = formattedString.substring(beginIndex);
@@ -95,17 +79,17 @@
final Cluster cluster;
if (firstChar == 'C' || startsWithV) {
final int clusterId = Integer.parseInt(formattedString.substring(1,
- beginIndex - 2));
+ beginIndex - 2));
final Vector clusterCenter = AbstractVector.decodeVector(center);
cluster = new Cluster(clusterCenter, clusterId);
cluster.setConverged(startsWithV);
} else {
- throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT + formattedString);
+ throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT
+ + formattedString);
}
return cluster;
}
-
@Override
public void write(DataOutput out) throws IOException {
super.write(out);
@@ -124,84 +108,8 @@
}
/**
- * Configure the distance measure from the job
- *
- * @param job the JobConf for the job
- */
- public static void configure(JobConf job) {
- try {
- ClassLoader ccl = Thread.currentThread().getContextClassLoader();
- Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
- measure = (DistanceMeasure) cl.newInstance();
- measure.configure(job);
- convergenceDelta = Double.parseDouble(job.get(CLUSTER_CONVERGENCE_KEY));
- nextClusterId = 0;
- } catch (ClassNotFoundException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- }
- }
-
- /**
- * Configure the distance measure directly. Used by unit tests.
- *
- * @param aMeasure the DistanceMeasure
- * @param aConvergenceDelta the delta value used to define convergence
- */
- public static void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
- measure = aMeasure;
- convergenceDelta = aConvergenceDelta;
- nextClusterId = 0;
- }
-
- /**
- * Emit the point to the nearest cluster center
- *
- * @param point a point
- * @param clusters a List<Cluster> to test
- * @param output the OutputCollector to emit into
- */
- public static void emitPointToNearestCluster(Vector point,
- List<Cluster> clusters, OutputCollector<Text, KMeansInfo> output)
- throws IOException {
- Cluster nearestCluster = null;
- double nearestDistance = Double.MAX_VALUE;
- for (Cluster cluster : clusters) {
- Vector clusterCenter = cluster.getCenter();
- double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
- if (distance < nearestDistance || nearestCluster == null ) {
- nearestCluster = cluster;
- nearestDistance = distance;
- }
- }
- // emit only clusterID
- output.collect(new Text(nearestCluster.getIdentifier()), new KMeansInfo(1, point));
- }
-
- public static void outputPointWithClusterInfo(Vector point,
- List<Cluster> clusters, OutputCollector<Text, Text> output)
- throws IOException {
- Cluster nearestCluster = null;
- double nearestDistance = Double.MAX_VALUE;
- for (Cluster cluster : clusters) {
- Vector clusterCenter = cluster.getCenter();
- double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
- if (distance < nearestDistance || nearestCluster == null) {
- nearestCluster = cluster;
- nearestDistance = distance;
- }
- }
- //TODO: this is ugly
- String name = point.getName();
- output.collect(new Text(name != null && name.length() != 0 ? name : point.asFormatString()), new Text(String.valueOf(nearestCluster.getId())));
- }
-
- /**
* Compute the centroid by averaging the pointTotals
- *
+ *
* @return the new centroid
*/
private Vector computeCentroid() {
@@ -216,12 +124,12 @@
/**
* Construct a new cluster with the given point as its center
- *
- * @param center the center point
+ *
+ * @param center
+ * the center point
*/
public Cluster(Vector center) {
super();
- this.setId(nextClusterId++);
this.setCenter(center);
this.setNumPoints(0);
this.setPointTotal(center.like());
@@ -234,8 +142,9 @@
/**
* Construct a new cluster with the given point as its center
- *
- * @param center the center point
+ *
+ * @param center
+ * the center point
*/
public Cluster(Vector center, int clusterId) {
super();
@@ -269,8 +178,9 @@
/**
* Add the point to the cluster
- *
- * @param point a point to add
+ *
+ * @param point
+ * a point to add
*/
public void addPoint(Vector point) {
addPoints(1, point);
@@ -278,9 +188,11 @@
/**
* Add the point to the cluster
- *
- * @param count the number of points in the delta
- * @param delta a point to add
+ *
+ * @param count
+ * the number of points in the delta
+ * @param delta
+ * a point to add
*/
public void addPoints(int count, Vector delta) {
centroid = null;
@@ -294,7 +206,6 @@
}
}
-
/** Compute the centroid and set the center to it. */
public void recomputeCenter() {
setCenter(computeCentroid());
@@ -304,16 +215,21 @@
/**
* Return if the cluster is converged by comparing its center and centroid.
- *
+ *
+ * @param measure
+ * The distance measure to use for cluster-point comparisons.
+ * @param convergenceDelta
+ * the convergence delta to use for stopping.
* @return if the cluster is converged
*/
- public boolean computeConvergence() {
+ public boolean computeConvergence(final DistanceMeasure measure,
+ final double convergenceDelta) {
Vector centroid = computeCentroid();
- converged = measure.distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta;
+ converged = measure.distance(centroid.getLengthSquared(), centroid,
+ getCenter()) <= convergenceDelta;
return converged;
}
-
public boolean isConverged() {
return converged;
}
@@ -325,8 +241,8 @@
/** @return the std */
public double getStd() {
Vector stds = pointSquaredTotal.times(getNumPoints()).minus(
- getPointTotal().times(getPointTotal())).assign(new SquareRootFunction())
- .divide(getNumPoints());
+ getPointTotal().times(getPointTotal()))
+ .assign(new SquareRootFunction()).divide(getNumPoints());
return stds.zSum() / 2;
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Thu Dec 10 09:34:48 2009
@@ -24,6 +24,7 @@
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.matrix.Vector;
import java.io.IOException;
@@ -33,34 +34,52 @@
public class KMeansClusterMapper extends MapReduceBase implements
Mapper<WritableComparable<?>, Vector, Text, Text> {
- private List<Cluster> clusters;
+ private final List<Cluster> clusters = new ArrayList<Cluster>();
+ private KMeansClusterer clusterer;
@Override
- public void map(WritableComparable<?> key, Vector point, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
- Cluster.outputPointWithClusterInfo(point, clusters, output);
+ public void map(WritableComparable<?> key, Vector point,
+ OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+ this.clusterer.outputPointWithClusterInfo(point, clusters, output);
}
/**
* Configure the mapper by providing its clusters. Used by unit tests.
- *
- * @param clusters a List<Cluster>
+ *
+ * @param clusters
+ * a List<Cluster>
*/
void config(List<Cluster> clusters) {
- this.clusters = clusters;
+ this.clusters.clear();
+ this.clusters.addAll(clusters);
}
@Override
public void configure(JobConf job) {
super.configure(job);
- Cluster.configure(job);
- clusters = new ArrayList<Cluster>();
-
- KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
- clusters);
-
- if (clusters.isEmpty()) {
- throw new NullPointerException("Cluster is empty!!!");
+ try {
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ Class<?> cl = ccl.loadClass(job
+ .get(KMeansConfigKeys.DISTANCE_MEASURE_KEY));
+ DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
+ measure.configure(job);
+
+ String clusterPath = job.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
+ if (clusterPath != null && clusterPath.length() > 0) {
+ KMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+ if (clusters.isEmpty()) {
+ throw new IllegalStateException("Cluster is empty!");
+ }
+ }
+
+ this.clusterer = new KMeansClusterer(measure);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java Thu Dec 10 09:34:48 2009
@@ -44,7 +44,6 @@
@Override
public void configure(JobConf job) {
super.configure(job);
- Cluster.configure(job);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Thu Dec 10 09:34:48 2009
@@ -230,11 +230,11 @@
conf.setCombinerClass(KMeansCombiner.class);
conf.setReducerClass(KMeansReducer.class);
conf.setNumReduceTasks(numReduceTasks);
- conf.set(Cluster.CLUSTER_PATH_KEY, clustersIn);
- conf.set(Cluster.DISTANCE_MEASURE_KEY, measureClass);
- conf.set(Cluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
- conf.setInt(Cluster.ITERATION_NUMBER, iteration);
-
+ conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+ conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+ conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
+ conf.set(KMeansConfigKeys.ITERATION_NUMBER, String.valueOf(iteration));
+
try {
JobClient.runJob(conf);
FileSystem fs = FileSystem.get(outPath.toUri(), conf);
@@ -277,9 +277,9 @@
conf.setMapperClass(KMeansClusterMapper.class);
conf.setNumReduceTasks(0);
- conf.set(Cluster.CLUSTER_PATH_KEY, clustersIn);
- conf.set(Cluster.DISTANCE_MEASURE_KEY, measureClass);
- conf.set(Cluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
+ conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+ conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+ conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
try {
JobClient.runJob(conf);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java Thu Dec 10 09:34:48 2009
@@ -23,43 +23,64 @@
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.matrix.Vector;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
public class KMeansMapper extends MapReduceBase implements
Mapper<WritableComparable<?>, Vector, Text, KMeansInfo> {
- private List<Cluster> clusters;
+ private KMeansClusterer clusterer;
+ private final List<Cluster> clusters = new ArrayList<Cluster>();
@Override
public void map(WritableComparable<?> key, Vector point,
- OutputCollector<Text, KMeansInfo> output, Reporter reporter) throws IOException {
- Cluster.emitPointToNearestCluster(point, clusters, output);
+ OutputCollector<Text, KMeansInfo> output, Reporter reporter)
+ throws IOException {
+ this.clusterer.emitPointToNearestCluster(point, this.clusters, output);
}
/**
* Configure the mapper by providing its clusters. Used by unit tests.
- *
- * @param clusters a List<Cluster>
+ *
+ * @param clusters
+ * a List<Cluster>
*/
void config(List<Cluster> clusters) {
- this.clusters = clusters;
+ this.clusters.clear();
+ this.clusters.addAll(clusters);
}
@Override
public void configure(JobConf job) {
super.configure(job);
- Cluster.configure(job);
-
- clusters = new ArrayList<Cluster>();
- KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
- clusters);
-
- if (clusters.isEmpty()) {
- throw new NullPointerException("Cluster is empty!!!");
+ try {
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ Class<?> cl = ccl.loadClass(job
+ .get(KMeansConfigKeys.DISTANCE_MEASURE_KEY));
+ DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
+ measure.configure(job);
+
+ this.clusterer = new KMeansClusterer(measure);
+
+ String clusterPath = job.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
+ if (clusterPath != null && clusterPath.length() > 0) {
+ KMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+ if (clusters.isEmpty()) {
+ throw new IllegalStateException("Cluster is empty!");
+ }
+ }
+
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
}
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java Thu Dec 10 09:34:48 2009
@@ -22,6 +22,7 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.common.distance.DistanceMeasure;
import java.io.IOException;
import java.util.ArrayList;
@@ -34,6 +35,8 @@
Reducer<Text, KMeansInfo, Text, Cluster> {
private Map<String, Cluster> clusterMap;
+ private double convergenceDelta;
+ private DistanceMeasure measure;
@Override
public void reduce(Text key, Iterator<KMeansInfo> values,
@@ -45,7 +48,7 @@
cluster.addPoints(delta.getPoints(), delta.getPointTotal());
}
// force convergence calculation
- cluster.computeConvergence();
+ cluster.computeConvergence(this.measure, this.convergenceDelta);
output.collect(new Text(cluster.getIdentifier()), cluster);
}
@@ -53,16 +56,33 @@
public void configure(JobConf job) {
super.configure(job);
- Cluster.configure(job);
- clusterMap = new HashMap<String, Cluster>();
-
- List<Cluster> clusters = new ArrayList<Cluster>();
- KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
- clusters);
- setClusterMap(clusters);
-
- if (clusterMap.isEmpty()) {
- throw new NullPointerException("Cluster is empty!!!");
+ try {
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ Class<?> cl = ccl.loadClass(job
+ .get(KMeansConfigKeys.DISTANCE_MEASURE_KEY));
+ this.measure = (DistanceMeasure) cl.newInstance();
+ this.measure.configure(job);
+
+ this.convergenceDelta = Double.parseDouble(job
+ .get(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY));
+
+ this.clusterMap = new HashMap<String, Cluster>();
+
+ String path = job.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
+ if (job != null && path.length() > 0) {
+ List<Cluster> clusters = new ArrayList<Cluster>();
+ KMeansUtil.configureWithClusterInfo(path, clusters);
+ setClusterMap(clusters);
+ if (clusterMap.isEmpty()) {
+ throw new NullPointerException("Cluster is empty!");
+ }
+ }
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Thu Dec 10 09:34:48 2009
@@ -73,8 +73,9 @@
List<Text> chosenTexts = new ArrayList<Text>(k);
List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
+ int nextClusterId = 0;
while (reader.next(key, value)) {
- Cluster newCluster = new Cluster(value);
+ Cluster newCluster = new Cluster(value, nextClusterId++);
newCluster.addPoint(value);
Text newText = new Text(key.toString());
int currentSize = chosenTexts.size();
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java Thu Dec 10 09:34:48 2009
@@ -20,19 +20,14 @@
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.matrix.AbstractVector;
import org.apache.mahout.matrix.CardinalityException;
import org.apache.mahout.matrix.DenseVector;
import org.apache.mahout.matrix.JsonVectorAdapter;
import org.apache.mahout.matrix.PlusFunction;
import org.apache.mahout.matrix.Vector;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import java.io.DataInput;
import java.io.DataOutput;
@@ -48,163 +43,53 @@
*/
public class MeanShiftCanopy extends ClusterBase {
- // keys used by Driver, Mapper, Combiner & Reducer
- public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
-
- public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
-
- public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
-
- public static final String CONTROL_PATH_KEY = "org.apache.mahout.clustering.control.path";
-
- public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.canopy.convergence";
-
- private static double convergenceDelta = 0;
-
- // the next canopyId to be allocated
- private static int nextCanopyId = 0;
-
- // the T1 distance threshold
- private static double t1;
-
- // the T2 distance threshold
- private static double t2;
-
- // the distance measure
- private static DistanceMeasure measure;
-
- // TODO: this is problematic, but how else to encode membership?
+ // TODO: this is problematic, but how else to encode membership?
private List<Vector> boundPoints = new ArrayList<Vector>();
private boolean converged = false;
- static double getT1() {
- return t1;
- }
-
- static double getT2() {
- return t2;
- }
-
- /**
- * Configure the Canopy and its distance measure
- *
- * @param job the JobConf for this job
- */
- public static void configure(JobConf job) {
- try {
- measure = Class.forName(job.get(DISTANCE_MEASURE_KEY)).asSubclass(
- DistanceMeasure.class).newInstance();
- measure.configure(job);
- } catch (ClassNotFoundException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- }
- nextCanopyId = 0;
- t1 = Double.parseDouble(job.get(T1_KEY));
- t2 = Double.parseDouble(job.get(T2_KEY));
- convergenceDelta = Double.parseDouble(job.get(CLUSTER_CONVERGENCE_KEY));
- }
-
- /**
- * Configure the Canopy for unit tests
- *
- * @param aDelta the convergence criteria
- */
- public static void config(DistanceMeasure aMeasure, double aT1, double aT2,
- double aDelta) {
- nextCanopyId = 100; // so canopyIds will sort properly
- measure = aMeasure;
- t1 = aT1;
- t2 = aT2;
- convergenceDelta = aDelta;
- }
-
- /**
- * Merge the given canopy into the canopies list. If it touches any existing canopy (norm<T1) then add the center of
- * each to the other. If it covers any other canopies (norm<T2), then merge the given canopy with the closest covering
- * canopy. If the given canopy does not cover any other canopies, add it to the canopies list.
- *
- * @param aCanopy a MeanShiftCanopy to be merged
- * @param canopies the List<Canopy> to be appended
- */
- public static void mergeCanopy(MeanShiftCanopy aCanopy,
- List<MeanShiftCanopy> canopies) {
- MeanShiftCanopy closestCoveringCanopy = null;
- double closestNorm = Double.MAX_VALUE;
- for (MeanShiftCanopy canopy : canopies) {
- double norm = measure.distance(canopy.getCenter(), aCanopy.getCenter());
- if (norm < t1) {
- aCanopy.touch(canopy);
- }
- if (norm < t2) {
- if (closestCoveringCanopy == null || norm < closestNorm) {
- closestNorm = norm;
- closestCoveringCanopy = canopy;
- }
- }
- }
- if (closestCoveringCanopy == null) {
- canopies.add(aCanopy);
- } else {
- closestCoveringCanopy.merge(aCanopy);
- }
- }
-
- /** Format the canopy for output */
- public static String formatCanopy(MeanShiftCanopy canopy) {
- Type vectorType = new TypeToken<Vector>() {
- }.getType();
- GsonBuilder gBuilder = new GsonBuilder();
- gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
- Gson gson = gBuilder.create();
- return gson.toJson(canopy, MeanShiftCanopy.class);
- }
-
- /**
- * Decodes and returns a Canopy from the formattedString
- *
- * @param formattedString a String produced by formatCanopy
- * @return a new Canopy
- */
- public static MeanShiftCanopy decodeCanopy(String formattedString) {
- Type vectorType = new TypeToken<Vector>() {
- }.getType();
- GsonBuilder gBuilder = new GsonBuilder();
- gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
- Gson gson = gBuilder.create();
- return gson.fromJson(formattedString, MeanShiftCanopy.class);
- }
-
public MeanShiftCanopy() {
super();
}
/** Create a new Canopy with the given canopyId */
+ /*
public MeanShiftCanopy(String id) {
this.setId(Integer.parseInt(id.substring(1)));
this.setCenter(null);
this.setPointTotal(null);
this.setNumPoints(0);
}
+ */
/**
* Create a new Canopy containing the given point
*
* @param point a Vector
*/
+ /*
public MeanShiftCanopy(Vector point) {
- this.setId(nextCanopyId++);
this.setCenter(point);
this.setPointTotal(point.clone());
this.setNumPoints(1);
this.boundPoints.add(point);
}
+ */
/**
+ * Create a new Canopy containing the given point and identified by the given id
+ *
+ * @param point a Vector
+ */
+ public MeanShiftCanopy(Vector point, int id) {
+ this.setId(id);
+ this.setCenter(point);
+ this.setPointTotal(point.clone());
+ this.setNumPoints(1);
+ this.boundPoints.add(point);
+ }
+
+ /**
* Create a new Canopy containing the given point, id and bound points
*
* @param point a Vector
@@ -236,16 +121,6 @@
}
/**
- * Return if the point is closely covered by this canopy
- *
- * @param point a Vector point
- * @return if the point is covered
- */
- public boolean closelyBound(Vector point) {
- return measure.distance(getCenter(), point) < t2;
- }
-
- /**
* Compute the bound centroid by averaging the bound points
*
* @return a Vector which is the new bound centroid
@@ -271,25 +146,6 @@
}
}
- /**
- * Return if the point is covered by this canopy
- *
- * @param point a Vector point
- * @return if the point is covered
- */
- boolean covers(Vector point) {
- return measure.distance(getCenter(), point) < t1;
- }
-
- /** Emit the new canopy to the collector, keyed by the canopy's Id */
- void emitCanopy(MeanShiftCanopy canopy,
- OutputCollector<Text, WritableComparable<?>> collector)
- throws IOException {
- String identifier = this.getIdentifier();
- collector.collect(new Text(identifier),
- new Text("new " + canopy.toString()));
- }
-
public List<Vector> getBoundPoints() {
return boundPoints;
}
@@ -322,20 +178,6 @@
boundPoints.addAll(canopy.boundPoints);
}
- /**
- * Shift the center to the new centroid of the cluster
- *
- * @return if the cluster is converged
- */
- public boolean shiftToMean() {
- Vector centroid = computeCentroid();
- converged = new EuclideanDistanceMeasure().distance(centroid, getCenter()) < convergenceDelta;
- setCenter(centroid);
- setNumPoints(1);
- setPointTotal(centroid.clone());
- return converged;
- }
-
@Override
public String toString() {
return formatCanopy(this);
@@ -386,4 +228,38 @@
public String asFormatString() {
return formatCanopy(this);
}
+
+ public void setBoundPoints(List<Vector> boundPoints) {
+ this.boundPoints = boundPoints;
+ }
+
+ public void setConverged(boolean converged) {
+ this.converged = converged;
+ }
+
+ /** Format the canopy for output */
+ public static String formatCanopy(MeanShiftCanopy canopy) {
+ Type vectorType = new TypeToken<Vector>() {
+ }.getType();
+ GsonBuilder gBuilder = new GsonBuilder();
+ gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
+ Gson gson = gBuilder.create();
+ return gson.toJson(canopy, MeanShiftCanopy.class);
+ }
+
+ /**
+ * Decodes and returns a Canopy from the formattedString
+ *
+ * @param formattedString a String produced by formatCanopy
+ * @return a new Canopy
+ */
+ public static MeanShiftCanopy decodeCanopy(String formattedString) {
+ Type vectorType = new TypeToken<Vector>() {
+ }.getType();
+ GsonBuilder gBuilder = new GsonBuilder();
+ gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
+ Gson gson = gBuilder.create();
+ return gson.fromJson(formattedString, MeanShiftCanopy.class);
+ }
+
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Thu Dec 10 09:34:48 2009
@@ -90,7 +90,7 @@
double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt).toString());
double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt).toString());
double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
- runJob(input, output, output + MeanShiftCanopy.CONTROL_PATH_KEY,
+ runJob(input, output, output + MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY,
measureClassName, t1, t2, convergenceDelta);
} catch (OptionException e) {
log.error("Exception parsing command line: ", e);
@@ -127,12 +127,11 @@
conf.setNumReduceTasks(1);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
- conf.set(MeanShiftCanopy.DISTANCE_MEASURE_KEY, measureClassName);
- conf.set(MeanShiftCanopy.CLUSTER_CONVERGENCE_KEY, String
- .valueOf(convergenceDelta));
- conf.set(MeanShiftCanopy.T1_KEY, String.valueOf(t1));
- conf.set(MeanShiftCanopy.T2_KEY, String.valueOf(t2));
- conf.set(MeanShiftCanopy.CONTROL_PATH_KEY, control);
+ conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+ conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+ conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
+ conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
+ conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control);
client.setConf(conf);
try {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java Thu Dec 10 09:34:48 2009
@@ -33,7 +33,8 @@
Mapper<WritableComparable<?>, MeanShiftCanopy, Text, MeanShiftCanopy> {
private final List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
+
+ private MeanShiftCanopyClusterer clusterer;
private OutputCollector<Text, MeanShiftCanopy> output;
@Override
@@ -41,13 +42,13 @@
OutputCollector<Text, MeanShiftCanopy> output, Reporter reporter)
throws IOException {
this.output = output;
- MeanShiftCanopy.mergeCanopy(canopy.shallowCopy(), canopies);
+ clusterer.mergeCanopy(canopy.shallowCopy(), canopies);
}
@Override
public void close() throws IOException {
for (MeanShiftCanopy canopy : canopies) {
- canopy.shiftToMean();
+ clusterer.shiftToMean(canopy);
output.collect(new Text("canopy"), canopy);
}
super.close();
@@ -56,7 +57,7 @@
@Override
public void configure(JobConf job) {
super.configure(job);
- MeanShiftCanopy.configure(job);
+ clusterer = new MeanShiftCanopyClusterer(job);
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java Thu Dec 10 09:34:48 2009
@@ -35,7 +35,7 @@
Reducer<Text, MeanShiftCanopy, Text, MeanShiftCanopy> {
private final List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
+ private MeanShiftCanopyClusterer clusterer;
private boolean allConverged = true;
private JobConf conf;
@@ -47,11 +47,11 @@
while (values.hasNext()) {
MeanShiftCanopy canopy = values.next();
- MeanShiftCanopy.mergeCanopy(canopy.shallowCopy(), canopies);
+ clusterer.mergeCanopy(canopy.shallowCopy(), canopies);
}
for (MeanShiftCanopy canopy : canopies) {
- allConverged = canopy.shiftToMean() && allConverged;
+ allConverged = clusterer.shiftToMean(canopy) && allConverged;
output.collect(new Text(canopy.getIdentifier()), canopy);
}
@@ -61,13 +61,13 @@
public void configure(JobConf job) {
super.configure(job);
this.conf = job;
- MeanShiftCanopy.configure(job);
+ clusterer = new MeanShiftCanopyClusterer(job);
}
@Override
public void close() throws IOException {
if (allConverged) {
- Path path = new Path(conf.get(MeanShiftCanopy.CONTROL_PATH_KEY));
+ Path path = new Path(conf.get(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY));
FileSystem.get(conf).createNewFile(path);
}
super.close();