You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2012/02/13 02:05:46 UTC
svn commit: r1243387 - in
/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans:
KMeansClusterMapper.java KMeansClusterer.java KMeansCombiner.java
KMeansMapper.java KMeansReducer.java KMeansUtil.java RandomSeedGenerator.java
Author: jeastman
Date: Mon Feb 13 01:05:46 2012
New Revision: 1243387
URL: http://svn.apache.org/viewvc?rev=1243387&view=rev
Log:
MAHOUT-933:
- Model: renamed count() to getNumObservations() and added getTotalObservations()
- Cluster: removed getNumPoints() which was redundant with getNumObservations()
- AbstractCluster: renamed numPoints to numObservations and added totalObservations. Added observation statistics to persistent state, updating write() and readFields() to serialize these fields and totalObservations
- CIMapper: added code to update policy based upon classifier model's state (especially totalObservations for Dirichlet)
- DirichletClusteringPolicy: removed totalCounts, which is now kept in each model, and changed update to use the given prior models' totalCounts
- FuzzyKMeansClusteringPolicy: added m and convergenceDelta
- KMeansClusteringPolicy: added convergenceDelta
- Adjusted many other classes to account for these fundamental changes
All tests run
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Mon Feb 13 01:05:46 2012
@@ -41,7 +41,7 @@ import org.apache.mahout.math.VectorWrit
public class KMeansClusterMapper
extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedPropertyVectorWritable> {
- private final Collection<Cluster> clusters = Lists.newArrayList();
+ private final Collection<Kluster> clusters = Lists.newArrayList();
private KMeansClusterer clusterer;
@Override
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Mon Feb 13 01:05:46 2012
@@ -36,7 +36,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * This class implements the k-means clustering algorithm. It uses {@link Cluster} as a cluster
+ * This class implements the k-means clustering algorithm. It uses {@link Kluster} as a cluster
* representation. The class can be used as part of a clustering job to be started as map/reduce job.
* */
public class KMeansClusterer {
@@ -67,12 +67,12 @@ public class KMeansClusterer {
* a List<Cluster> to test.
*/
public void emitPointToNearestCluster(Vector point,
- Iterable<Cluster> clusters,
+ Iterable<Kluster> clusters,
Mapper<?,?,Text,ClusterObservations>.Context context)
throws IOException, InterruptedException {
- Cluster nearestCluster = null;
+ Kluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
- for (Cluster cluster : clusters) {
+ for (Kluster cluster : clusters) {
Vector clusterCenter = cluster.getCenter();
double distance = this.measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
if (log.isDebugEnabled()) {
@@ -91,10 +91,10 @@ public class KMeansClusterer {
* @param point
* @param clusters
*/
- protected void addPointToNearestCluster(Vector point, Iterable<Cluster> clusters) {
- Cluster closestCluster = null;
+ protected void addPointToNearestCluster(Vector point, Iterable<Kluster> clusters) {
+ Kluster closestCluster = null;
double closestDistance = Double.MAX_VALUE;
- for (Cluster cluster : clusters) {
+ for (Kluster cluster : clusters) {
double distance = measure.distance(cluster.getCenter(), point);
if (closestCluster == null || closestDistance > distance) {
closestCluster = cluster;
@@ -107,9 +107,9 @@ public class KMeansClusterer {
/**
* Sequential implementation to test convergence and update cluster centers
*/
- protected boolean testConvergence(Iterable<Cluster> clusters, double distanceThreshold) {
+ protected boolean testConvergence(Iterable<Kluster> clusters, double distanceThreshold) {
boolean converged = true;
- for (Cluster cluster : clusters) {
+ for (Kluster cluster : clusters) {
if (!computeConvergence(cluster, distanceThreshold)) {
converged = false;
}
@@ -119,7 +119,7 @@ public class KMeansClusterer {
}
public void outputPointWithClusterInfo(Vector vector,
- Iterable<Cluster> clusters,
+ Iterable<Kluster> clusters,
Mapper<?,?,IntWritable,WeightedPropertyVectorWritable>.Context context)
throws IOException, InterruptedException {
AbstractCluster nearestCluster = null;
@@ -146,7 +146,7 @@ public class KMeansClusterer {
* @param clusters
* a List<Cluster> to test.
*/
- protected void emitPointToNearestCluster(Vector point, Iterable<Cluster> clusters, Writer writer)
+ protected void emitPointToNearestCluster(Vector point, Iterable<Kluster> clusters, Writer writer)
throws IOException {
AbstractCluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
@@ -177,21 +177,21 @@ public class KMeansClusterer {
* @param maxIter
* the maximum number of iterations
*/
- public static List<List<Cluster>> clusterPoints(Iterable<Vector> points,
- List<Cluster> clusters,
+ public static List<List<Kluster>> clusterPoints(Iterable<Vector> points,
+ List<Kluster> clusters,
DistanceMeasure measure,
int maxIter,
double distanceThreshold) {
- List<List<Cluster>> clustersList = Lists.newArrayList();
+ List<List<Kluster>> clustersList = Lists.newArrayList();
clustersList.add(clusters);
boolean converged = false;
int iteration = 0;
while (!converged && iteration < maxIter) {
log.info("Reference Iteration: {}", iteration);
- List<Cluster> next = Lists.newArrayList();
- for (Cluster c : clustersList.get(iteration)) {
- next.add(new Cluster(c.getCenter(), c.getId(), measure));
+ List<Kluster> next = Lists.newArrayList();
+ for (Kluster c : clustersList.get(iteration)) {
+ next.add(new Kluster(c.getCenter(), c.getId(), measure));
}
clustersList.add(next);
converged = runKMeansIteration(points, next, measure, distanceThreshold);
@@ -212,7 +212,7 @@ public class KMeansClusterer {
* a DistanceMeasure to use
*/
protected static boolean runKMeansIteration(Iterable<Vector> points,
- Iterable<Cluster> clusters,
+ Iterable<Kluster> clusters,
DistanceMeasure measure,
double distanceThreshold) {
// iterate through all points, assigning each to the nearest cluster
@@ -223,7 +223,7 @@ public class KMeansClusterer {
return clusterer.testConvergence(clusters, distanceThreshold);
}
- public boolean computeConvergence(Cluster cluster, double distanceThreshold) {
+ public boolean computeConvergence(Kluster cluster, double distanceThreshold) {
return cluster.computeConvergence(measure, distanceThreshold);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java Mon Feb 13 01:05:46 2012
@@ -27,7 +27,7 @@ public class KMeansCombiner extends Redu
@Override
protected void reduce(Text key, Iterable<ClusterObservations> values, Context context)
throws IOException, InterruptedException {
- Cluster cluster = new Cluster();
+ Kluster cluster = new Kluster();
for (ClusterObservations value : values) {
cluster.observe(value);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java Mon Feb 13 01:05:46 2012
@@ -34,7 +34,7 @@ public class KMeansMapper extends Mapper
private KMeansClusterer clusterer;
- private final Collection<Cluster> clusters = Lists.newArrayList();
+ private final Collection<Kluster> clusters = Lists.newArrayList();
@Override
protected void map(WritableComparable<?> key, VectorWritable point, Context context)
@@ -67,7 +67,7 @@ public class KMeansMapper extends Mapper
* @param clusters
* a List<Cluster>
*/
- void setup(Collection<Cluster> clusters, DistanceMeasure measure) {
+ void setup(Collection<Kluster> clusters, DistanceMeasure measure) {
this.clusters.clear();
this.clusters.addAll(clusters);
this.clusterer = new KMeansClusterer(measure);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java Mon Feb 13 01:05:46 2012
@@ -30,16 +30,16 @@ import org.apache.mahout.clustering.Clus
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
-public class KMeansReducer extends Reducer<Text, ClusterObservations, Text, Cluster> {
+public class KMeansReducer extends Reducer<Text, ClusterObservations, Text, Kluster> {
- private Map<String, Cluster> clusterMap;
+ private Map<String, Kluster> clusterMap;
private double convergenceDelta;
private KMeansClusterer clusterer;
@Override
protected void reduce(Text key, Iterable<ClusterObservations> values, Context context)
throws IOException, InterruptedException {
- Cluster cluster = clusterMap.get(key.toString());
+ Kluster cluster = clusterMap.get(key.toString());
for (ClusterObservations delta : values) {
cluster.observe(delta);
}
@@ -66,7 +66,7 @@ public class KMeansReducer extends Reduc
String path = conf.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
if (!path.isEmpty()) {
- Collection<Cluster> clusters = Lists.newArrayList();
+ Collection<Kluster> clusters = Lists.newArrayList();
KMeansUtil.configureWithClusterInfo(conf, new Path(path), clusters);
setClusterMap(clusters);
if (clusterMap.isEmpty()) {
@@ -75,15 +75,15 @@ public class KMeansReducer extends Reduc
}
}
- private void setClusterMap(Collection<Cluster> clusters) {
+ private void setClusterMap(Collection<Kluster> clusters) {
clusterMap = Maps.newHashMap();
- for (Cluster cluster : clusters) {
+ for (Kluster cluster : clusters) {
clusterMap.put(cluster.getIdentifier(), cluster);
}
clusters.clear();
}
- public void setup(Collection<Cluster> clusters, DistanceMeasure measure) {
+ public void setup(Collection<Kluster> clusters, DistanceMeasure measure) {
setClusterMap(clusters);
this.clusterer = new KMeansClusterer(measure);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java Mon Feb 13 01:05:46 2012
@@ -35,17 +35,17 @@ final class KMeansUtil {
/** Configure the mapper with the cluster info */
public static void configureWithClusterInfo(Configuration conf,
Path clusterPath,
- Collection<Cluster> clusters) {
+ Collection<Kluster> clusters) {
for (Writable value :
new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST, PathFilters.partFilter(), conf)) {
Class<? extends Writable> valueClass = value.getClass();
- if (valueClass.equals(Cluster.class)) {
+ if (valueClass.equals(Kluster.class)) {
// get the cluster info
- clusters.add((Cluster) value);
+ clusters.add((Kluster) value);
} else if (valueClass.equals(Canopy.class)) {
// get the cluster info
Canopy canopy = (Canopy) value;
- clusters.add(new Cluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
+ clusters.add(new Kluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
} else {
throw new IllegalStateException("Bad value class: " + valueClass);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Mon Feb 13 01:05:46 2012
@@ -42,7 +42,7 @@ import org.slf4j.LoggerFactory;
/**
* Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors and
- * write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Cluster} representing the
+ * write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Kluster} representing the
* initial centroid to use.
*/
public final class RandomSeedGenerator {
@@ -74,10 +74,10 @@ public final class RandomSeedGenerator {
}
FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
- SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Cluster.class);
+ SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Kluster.class);
Random random = RandomUtils.getRandom();
List<Text> chosenTexts = Lists.newArrayListWithCapacity(k);
- List<Cluster> chosenClusters = Lists.newArrayListWithCapacity(k);
+ List<Kluster> chosenClusters = Lists.newArrayListWithCapacity(k);
int nextClusterId = 0;
for (FileStatus fileStatus : inputFiles) {
@@ -88,7 +88,7 @@ public final class RandomSeedGenerator {
: new SequenceFileIterable<Writable,VectorWritable>(fileStatus.getPath(), true, conf)) {
Writable key = record.getFirst();
VectorWritable value = record.getSecond();
- Cluster newCluster = new Cluster(value.get(), nextClusterId++, measure);
+ Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
newCluster.observe(value.get(), 1);
Text newText = new Text(key.toString());
int currentSize = chosenTexts.size();