You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2012/02/13 02:05:46 UTC

svn commit: r1243387 - in /mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans: KMeansClusterMapper.java KMeansClusterer.java KMeansCombiner.java KMeansMapper.java KMeansReducer.java KMeansUtil.java RandomSeedGenerator.java

Author: jeastman
Date: Mon Feb 13 01:05:46 2012
New Revision: 1243387

URL: http://svn.apache.org/viewvc?rev=1243387&view=rev
Log:
MAHOUT-933:
- Model: renamed count() to getNumObservations() and added getTotalObservations()
- Cluster: removed getNumPoints() which was redundant with getNumObservations()
- AbstractCluster: renamed numPoints to numObservations and added totalObservations. Added observation statistics to persistent state, updating write() and readFields() to serialize these fields and totalObservations
- CIMapper: added code to update policy based upon classifier model's state (especially totalObservations for Dirichlet)
- DirichletClusteringPolicy: removed totalCounts now in each model and changed update to use given prior models' totalCounts
- FuzzyKMeansClusteringPolicy: added m and convergenceDelta
- KMeansClusteringPolicy: added convergenceDelta
- Adjusted many other classes to account for these fundamental changes

All tests run

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Mon Feb 13 01:05:46 2012
@@ -41,7 +41,7 @@ import org.apache.mahout.math.VectorWrit
 public class KMeansClusterMapper
     extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedPropertyVectorWritable> {
   
-  private final Collection<Cluster> clusters = Lists.newArrayList();
+  private final Collection<Kluster> clusters = Lists.newArrayList();
   private KMeansClusterer clusterer;
 
   @Override

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Mon Feb 13 01:05:46 2012
@@ -36,7 +36,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * This class implements the k-means clustering algorithm. It uses {@link Cluster} as a cluster
+ * This class implements the k-means clustering algorithm. It uses {@link Kluster} as a cluster
  * representation. The class can be used as part of a clustering job to be started as map/reduce job.
  * */
 public class KMeansClusterer {
@@ -67,12 +67,12 @@ public class KMeansClusterer {
    *          a List<Cluster> to test.
    */
   public void emitPointToNearestCluster(Vector point,
-                                        Iterable<Cluster> clusters,
+                                        Iterable<Kluster> clusters,
                                         Mapper<?,?,Text,ClusterObservations>.Context context)
     throws IOException, InterruptedException {
-    Cluster nearestCluster = null;
+    Kluster nearestCluster = null;
     double nearestDistance = Double.MAX_VALUE;
-    for (Cluster cluster : clusters) {
+    for (Kluster cluster : clusters) {
       Vector clusterCenter = cluster.getCenter();
       double distance = this.measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
       if (log.isDebugEnabled()) {
@@ -91,10 +91,10 @@ public class KMeansClusterer {
    * @param point
    * @param clusters
    */
-  protected void addPointToNearestCluster(Vector point, Iterable<Cluster> clusters) {
-    Cluster closestCluster = null;
+  protected void addPointToNearestCluster(Vector point, Iterable<Kluster> clusters) {
+    Kluster closestCluster = null;
     double closestDistance = Double.MAX_VALUE;
-    for (Cluster cluster : clusters) {
+    for (Kluster cluster : clusters) {
       double distance = measure.distance(cluster.getCenter(), point);
       if (closestCluster == null || closestDistance > distance) {
         closestCluster = cluster;
@@ -107,9 +107,9 @@ public class KMeansClusterer {
   /**
    * Sequential implementation to test convergence and update cluster centers
    */
-  protected boolean testConvergence(Iterable<Cluster> clusters, double distanceThreshold) {
+  protected boolean testConvergence(Iterable<Kluster> clusters, double distanceThreshold) {
     boolean converged = true;
-    for (Cluster cluster : clusters) {
+    for (Kluster cluster : clusters) {
       if (!computeConvergence(cluster, distanceThreshold)) {
         converged = false;
       }
@@ -119,7 +119,7 @@ public class KMeansClusterer {
   }
 
   public void outputPointWithClusterInfo(Vector vector,
-                                         Iterable<Cluster> clusters,
+                                         Iterable<Kluster> clusters,
                                          Mapper<?,?,IntWritable,WeightedPropertyVectorWritable>.Context context)
     throws IOException, InterruptedException {
     AbstractCluster nearestCluster = null;
@@ -146,7 +146,7 @@ public class KMeansClusterer {
    * @param clusters
    *          a List<Cluster> to test.
    */
-  protected void emitPointToNearestCluster(Vector point, Iterable<Cluster> clusters, Writer writer)
+  protected void emitPointToNearestCluster(Vector point, Iterable<Kluster> clusters, Writer writer)
     throws IOException {
     AbstractCluster nearestCluster = null;
     double nearestDistance = Double.MAX_VALUE;
@@ -177,21 +177,21 @@ public class KMeansClusterer {
    * @param maxIter
    *          the maximum number of iterations
    */
-  public static List<List<Cluster>> clusterPoints(Iterable<Vector> points,
-                                                  List<Cluster> clusters,
+  public static List<List<Kluster>> clusterPoints(Iterable<Vector> points,
+                                                  List<Kluster> clusters,
                                                   DistanceMeasure measure,
                                                   int maxIter,
                                                   double distanceThreshold) {
-    List<List<Cluster>> clustersList = Lists.newArrayList();
+    List<List<Kluster>> clustersList = Lists.newArrayList();
     clustersList.add(clusters);
 
     boolean converged = false;
     int iteration = 0;
     while (!converged && iteration < maxIter) {
       log.info("Reference Iteration: {}", iteration);
-      List<Cluster> next = Lists.newArrayList();
-      for (Cluster c : clustersList.get(iteration)) {
-        next.add(new Cluster(c.getCenter(), c.getId(), measure));
+      List<Kluster> next = Lists.newArrayList();
+      for (Kluster c : clustersList.get(iteration)) {
+        next.add(new Kluster(c.getCenter(), c.getId(), measure));
       }
       clustersList.add(next);
       converged = runKMeansIteration(points, next, measure, distanceThreshold);
@@ -212,7 +212,7 @@ public class KMeansClusterer {
    *          a DistanceMeasure to use
    */
   protected static boolean runKMeansIteration(Iterable<Vector> points,
-                                              Iterable<Cluster> clusters,
+                                              Iterable<Kluster> clusters,
                                               DistanceMeasure measure,
                                               double distanceThreshold) {
     // iterate through all points, assigning each to the nearest cluster
@@ -223,7 +223,7 @@ public class KMeansClusterer {
     return clusterer.testConvergence(clusters, distanceThreshold);
   }
 
-  public boolean computeConvergence(Cluster cluster, double distanceThreshold) {
+  public boolean computeConvergence(Kluster cluster, double distanceThreshold) {
     return cluster.computeConvergence(measure, distanceThreshold);
   }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java Mon Feb 13 01:05:46 2012
@@ -27,7 +27,7 @@ public class KMeansCombiner extends Redu
   @Override
   protected void reduce(Text key, Iterable<ClusterObservations> values, Context context)
     throws IOException, InterruptedException {
-    Cluster cluster = new Cluster();
+    Kluster cluster = new Kluster();
     for (ClusterObservations value : values) {
       cluster.observe(value);
     }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java Mon Feb 13 01:05:46 2012
@@ -34,7 +34,7 @@ public class KMeansMapper extends Mapper
 
   private KMeansClusterer clusterer;
 
-  private final Collection<Cluster> clusters = Lists.newArrayList();
+  private final Collection<Kluster> clusters = Lists.newArrayList();
 
   @Override
   protected void map(WritableComparable<?> key, VectorWritable point, Context context)
@@ -67,7 +67,7 @@ public class KMeansMapper extends Mapper
    * @param clusters
    *          a List<Cluster>
    */
-  void setup(Collection<Cluster> clusters, DistanceMeasure measure) {
+  void setup(Collection<Kluster> clusters, DistanceMeasure measure) {
     this.clusters.clear();
     this.clusters.addAll(clusters);
     this.clusterer = new KMeansClusterer(measure);

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java Mon Feb 13 01:05:46 2012
@@ -30,16 +30,16 @@ import org.apache.mahout.clustering.Clus
 import org.apache.mahout.common.ClassUtils;
 import org.apache.mahout.common.distance.DistanceMeasure;
 
-public class KMeansReducer extends Reducer<Text, ClusterObservations, Text, Cluster> {
+public class KMeansReducer extends Reducer<Text, ClusterObservations, Text, Kluster> {
 
-  private Map<String, Cluster> clusterMap;
+  private Map<String, Kluster> clusterMap;
   private double convergenceDelta;
   private KMeansClusterer clusterer;
 
   @Override
   protected void reduce(Text key, Iterable<ClusterObservations> values, Context context)
     throws IOException, InterruptedException {
-    Cluster cluster = clusterMap.get(key.toString());
+    Kluster cluster = clusterMap.get(key.toString());
     for (ClusterObservations delta : values) {
       cluster.observe(delta);
     }
@@ -66,7 +66,7 @@ public class KMeansReducer extends Reduc
 
     String path = conf.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
     if (!path.isEmpty()) {
-      Collection<Cluster> clusters = Lists.newArrayList();
+      Collection<Kluster> clusters = Lists.newArrayList();
       KMeansUtil.configureWithClusterInfo(conf, new Path(path), clusters);
       setClusterMap(clusters);
       if (clusterMap.isEmpty()) {
@@ -75,15 +75,15 @@ public class KMeansReducer extends Reduc
     }
   }
 
-  private void setClusterMap(Collection<Cluster> clusters) {
+  private void setClusterMap(Collection<Kluster> clusters) {
     clusterMap = Maps.newHashMap();
-    for (Cluster cluster : clusters) {
+    for (Kluster cluster : clusters) {
       clusterMap.put(cluster.getIdentifier(), cluster);
     }
     clusters.clear();
   }
 
-  public void setup(Collection<Cluster> clusters, DistanceMeasure measure) {
+  public void setup(Collection<Kluster> clusters, DistanceMeasure measure) {
     setClusterMap(clusters);
     this.clusterer = new KMeansClusterer(measure);
   }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java Mon Feb 13 01:05:46 2012
@@ -35,17 +35,17 @@ final class KMeansUtil {
   /** Configure the mapper with the cluster info */
   public static void configureWithClusterInfo(Configuration conf,
                                               Path clusterPath,
-                                              Collection<Cluster> clusters) {
+                                              Collection<Kluster> clusters) {
     for (Writable value :
          new SequenceFileDirValueIterable<Writable>(clusterPath, PathType.LIST, PathFilters.partFilter(), conf)) {
       Class<? extends Writable> valueClass = value.getClass();
-      if (valueClass.equals(Cluster.class)) {
+      if (valueClass.equals(Kluster.class)) {
         // get the cluster info
-        clusters.add((Cluster) value);
+        clusters.add((Kluster) value);
       } else if (valueClass.equals(Canopy.class)) {
         // get the cluster info
         Canopy canopy = (Canopy) value;
-        clusters.add(new Cluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
+        clusters.add(new Kluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
       } else {
         throw new IllegalStateException("Bad value class: " + valueClass);
       }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=1243387&r1=1243386&r2=1243387&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Mon Feb 13 01:05:46 2012
@@ -42,7 +42,7 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors and
- * write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Cluster} representing the
+ * write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Kluster} representing the
  * initial centroid to use.
  */
 public final class RandomSeedGenerator {
@@ -74,10 +74,10 @@ public final class RandomSeedGenerator {
       }
       
       FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
-      SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Cluster.class);
+      SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outFile, Text.class, Kluster.class);
       Random random = RandomUtils.getRandom();
       List<Text> chosenTexts = Lists.newArrayListWithCapacity(k);
-      List<Cluster> chosenClusters = Lists.newArrayListWithCapacity(k);
+      List<Kluster> chosenClusters = Lists.newArrayListWithCapacity(k);
       int nextClusterId = 0;
       
       for (FileStatus fileStatus : inputFiles) {
@@ -88,7 +88,7 @@ public final class RandomSeedGenerator {
              : new SequenceFileIterable<Writable,VectorWritable>(fileStatus.getPath(), true, conf)) {
           Writable key = record.getFirst();
           VectorWritable value = record.getSecond();
-          Cluster newCluster = new Cluster(value.get(), nextClusterId++, measure);
+          Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
           newCluster.observe(value.get(), 1);
           Text newText = new Text(key.toString());
           int currentSize = chosenTexts.size();