You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by is...@apache.org on 2009/12/10 10:34:54 UTC

svn commit: r889158 [1/2] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/ core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/main/java/org/apache/mah...

Author: isabel
Date: Thu Dec 10 09:34:48 2009
New Revision: 889158

URL: http://svn.apache.org/viewvc?rev=889158&view=rev
Log:
MAHOUT-11 - refactors lots of the clustering code to get rid of its
static fields. Thanks to Drew Farris for fixing most of the code.

Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/VisibleCanopy.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/canopy/DisplayCanopy.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/fuzzykmeans/DisplayFuzzyKMeans.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java Thu Dec 10 09:34:48 2009
@@ -43,7 +43,7 @@
     return id;
   }
 
-  protected void setId(int id) {
+  public void setId(int id) {
     this.id = id;
   }
 
@@ -51,7 +51,7 @@
     return center;
   }
 
-  protected void setCenter(Vector center) {
+  public void setCenter(Vector center) {
     this.center = center;
   }
 
@@ -59,7 +59,7 @@
     return numPoints;
   }
 
-  protected void setNumPoints(int numPoints) {
+  public void setNumPoints(int numPoints) {
     this.numPoints = numPoints;
   }
 
@@ -67,7 +67,7 @@
     return pointTotal;
   }
 
-  protected void setPointTotal(Vector pointTotal) {
+  public void setPointTotal(Vector pointTotal) {
     this.pointTotal = pointTotal;
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java Thu Dec 10 09:34:48 2009
@@ -18,17 +18,14 @@
 package org.apache.mahout.clustering.canopy;
 
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.matrix.AbstractVector;
 import org.apache.mahout.matrix.Vector;
-import org.apache.mahout.common.distance.DistanceMeasure;
 
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.List;
 
 /**
  * This class models a canopy as a center point, the number of points that are contained within it according to the
@@ -37,45 +34,11 @@
  */
 public class Canopy extends ClusterBase {
 
-  // keys used by Driver, Mapper, Combiner & Reducer
-  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
-
-  public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
-
-  public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
-
-  public static final String CANOPY_PATH_KEY = "org.apache.mahout.clustering.canopy.path";
-
-  // the next canopyId to be allocated
-  private static int nextCanopyId = 0;
-
-  // the T1 distance threshold
-  private static double t1;
-
-  // the T2 distance threshold
-  private static double t2;
-
-  // the distance measure
-  private static DistanceMeasure measure;
-
-
-  /** Used w */
+  /** Used for deserialization as a writable */
   public Canopy() {
   }
 
   /**
-   * Create a new Canopy containing the given point
-   *
-   * @param point a point in vector space
-   */
-  public Canopy(Vector point) {
-    this.setId(nextCanopyId++);
-    this.setCenter(point.clone());
-    this.setPointTotal(point.clone());
-    this.setNumPoints(1);
-  }
-
-  /**
    * Create a new Canopy containing the given point and canopyId
    *
    * @param point    a point in vector space
@@ -88,118 +51,6 @@
     this.setNumPoints(1);
   }
 
-  /**
-   * Configure the Canopy and its distance measure
-   *
-   * @param job the JobConf for this job
-   */
-  public static void configure(JobConf job) {
-    try {
-      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-      Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
-      measure = (DistanceMeasure) cl.newInstance();
-      measure.configure(job);
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    }
-    nextCanopyId = 0;
-    t1 = Double.parseDouble(job.get(T1_KEY));
-    t2 = Double.parseDouble(job.get(T2_KEY));
-  }
-
-  /** Configure the Canopy for unit tests */
-  public static void config(DistanceMeasure aMeasure, double aT1, double aT2) {
-    nextCanopyId = 0;
-    measure = aMeasure;
-    t1 = aT1;
-    t2 = aT2;
-  }
-
-  /**
-   * This is the same algorithm as the reference but inverted to iterate over existing canopies instead of the points.
-   * Because of this it does not need to actually store the points, instead storing a total points vector and the number
-   * of points. From this a centroid can be computed. <p/> This method is used by the CanopyReducer.
-   *
-   * @param point    the point to be added
-   * @param canopies the List<Canopy> to be appended
-   */
-  public static void addPointToCanopies(Vector point, List<Canopy> canopies) {
-    boolean pointStronglyBound = false;
-    for (Canopy canopy : canopies) {
-      double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
-      if (dist < t1) {
-        canopy.addPoint(point);
-      }
-      pointStronglyBound = pointStronglyBound || (dist < t2);
-    }
-    if (!pointStronglyBound) {
-      canopies.add(new Canopy(point));
-    }
-  }
-
-  /**
-   * This method is used by the CanopyMapper to perform canopy inclusion tests and to emit the point and its covering
-   * canopies to the output. The CanopyCombiner will then sum the canopy points and produce the centroids.
-   *
-   * @param point     the point to be added
-   * @param canopies  the List<Canopy> to be appended
-   * @param collector an OutputCollector in which to emit the point
-   */
-  public static void emitPointToNewCanopies(Vector point,
-                                            List<Canopy> canopies, OutputCollector<Text, Vector> collector)
-      throws IOException {
-    boolean pointStronglyBound = false;
-    for (Canopy canopy : canopies) {
-      double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
-      if (dist < t1) {
-        canopy.emitPoint(point, collector);
-      }
-      pointStronglyBound = pointStronglyBound || (dist < t2);
-    }
-    if (!pointStronglyBound) {
-      Canopy canopy = new Canopy(point);
-      canopies.add(canopy);
-      canopy.emitPoint(point, collector);
-    }
-  }
-
-  /**
-   * This method is used by the CanopyMapper to perform canopy inclusion tests and to emit the point keyed by its
-   * covering canopies to the output. if the point is not covered by any canopies (due to canopy centroid clustering),
-   * emit the point to the closest covering canopy.
-   *
-   * @param point     the point to be added
-   * @param canopies  the List<Canopy> to be appended
-   * @param collector an OutputCollector in which to emit the point
-   */
-  public static void emitPointToExistingCanopies(Vector point,
-                                                 List<Canopy> canopies,
-                                                 OutputCollector<Text, Vector> collector) throws IOException {
-    double minDist = Double.MAX_VALUE;
-    Canopy closest = null;
-    boolean isCovered = false;
-    for (Canopy canopy : canopies) {
-      double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
-      if (dist < t1) {
-        isCovered = true;
-        collector.collect(new Text(canopy.getIdentifier()), point);
-      } else if (dist < minDist) {
-        minDist = dist;
-        closest = canopy;
-      }
-    }
-    // if the point is not contained in any canopies (due to canopy centroid
-    // clustering), emit the point to the closest covering canopy.
-    if (!isCovered) {
-      collector.collect(new Text(closest.getIdentifier()), point);
-    }
-  }
-
-
   @Override
   public void write(DataOutput out) throws IOException {
     super.write(out);
@@ -283,14 +134,4 @@
   public Vector computeCentroid() {
     return getPointTotal().divide(getNumPoints());
   }
-
-  /**
-   * Return if the point is covered by this canopy
-   *
-   * @param point a point
-   * @return if the point is covered
-   */
-  public boolean covers(Vector point) {
-    return measure.distance(getCenter().getLengthSquared(), getCenter(), point) < t1;
-  }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Thu Dec 10 09:34:48 2009
@@ -134,11 +134,10 @@
     log.info("Input: " + input + " Out: " + output + " Measure: " + measureClassName + " t1: " + t1
         + " t2: " + t2 + " Vector Class: " + vectorClass.getSimpleName());
     JobClient client = new JobClient();
-    JobConf conf = new JobConf(
-        org.apache.mahout.clustering.canopy.CanopyDriver.class);
-    conf.set(Canopy.DISTANCE_MEASURE_KEY, measureClassName);
-    conf.set(Canopy.T1_KEY, String.valueOf(t1));
-    conf.set(Canopy.T2_KEY, String.valueOf(t2));
+    JobConf conf = new JobConf(CanopyDriver.class);
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
+    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
 
     conf.setInputFormat(SequenceFileInputFormat.class);
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java Thu Dec 10 09:34:48 2009
@@ -37,17 +37,19 @@
 
   private OutputCollector<Text, Vector> outputCollector;
 
+  private CanopyClusterer canopyClusterer;
+  
   @Override
   public void map(WritableComparable<?> key, Vector point,
                   OutputCollector<Text, Vector> output, Reporter reporter) throws IOException {
     outputCollector = output;
-    Canopy.addPointToCanopies(point, canopies);
+    canopyClusterer.addPointToCanopies(point, canopies);
   }
 
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    Canopy.configure(job);
+    canopyClusterer = new CanopyClusterer(job);
   }
 
   @Override

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java Thu Dec 10 09:34:48 2009
@@ -35,12 +35,14 @@
 
   private final List<Canopy> canopies = new ArrayList<Canopy>();
 
+  private CanopyClusterer canopyClusterer;
+  
   @Override
   public void reduce(Text key, Iterator<Vector> values,
                      OutputCollector<Text, Canopy> output, Reporter reporter) throws IOException {
     while (values.hasNext()) {
       Vector point = values.next();
-      Canopy.addPointToCanopies(point, canopies);
+      canopyClusterer.addPointToCanopies(point, canopies);
     }
     for (Canopy canopy : canopies) {
       output.collect(new Text(canopy.getIdentifier()), canopy);
@@ -50,7 +52,7 @@
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    Canopy.configure(job);
+    canopyClusterer = new CanopyClusterer(job);
   }
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java Thu Dec 10 09:34:48 2009
@@ -140,13 +140,12 @@
   public static void runJob(String points, String canopies, String output,
                             String measureClassName, double t1, double t2, Class<? extends Vector> vectorClass) throws IOException {
     JobClient client = new JobClient();
-    JobConf conf = new JobConf(
-        org.apache.mahout.clustering.canopy.ClusterDriver.class);
+    JobConf conf = new JobConf(ClusterDriver.class);
 
-    conf.set(Canopy.DISTANCE_MEASURE_KEY, measureClassName);
-    conf.set(Canopy.T1_KEY, String.valueOf(t1));
-    conf.set(Canopy.T2_KEY, String.valueOf(t2));
-    conf.set(Canopy.CANOPY_PATH_KEY, canopies);
+    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
+    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
+    conf.set(CanopyConfigKeys.CANOPY_PATH_KEY, canopies);
 
     conf.setInputFormat(SequenceFileInputFormat.class);
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterMapper.java Thu Dec 10 09:34:48 2009
@@ -36,12 +36,13 @@
 public class ClusterMapper extends MapReduceBase implements
     Mapper<WritableComparable<?>, Vector, Text, Vector> {
 
-  private List<Canopy> canopies;
+  private CanopyClusterer canopyClusterer;
+  private final List<Canopy> canopies = new ArrayList<Canopy>();
 
   @Override
   public void map(WritableComparable<?> key, Vector point,
                   OutputCollector<Text, Vector> output, Reporter reporter) throws IOException {
-    Canopy.emitPointToExistingCanopies(point, canopies, output);
+    canopyClusterer.emitPointToExistingCanopies(point, canopies, output);
   }
 
   /**
@@ -50,33 +51,42 @@
    * @param canopies a List<Canopy>
    */
   public void config(List<Canopy> canopies) {
-    this.canopies = canopies;
+    this.canopies.clear();
+    this.canopies.addAll(canopies);
   }
 
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    Canopy.configure(job);
+    canopyClusterer = new CanopyClusterer(job);
 
-    String canopyPath = job.get(Canopy.CANOPY_PATH_KEY);
-    canopies = new ArrayList<Canopy>();
-
-    try {
-      Path path = new Path(canopyPath + "/part-00000");
-      FileSystem fs = FileSystem.get(path.toUri(), job);
-      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+    String canopyPath = job.get(CanopyConfigKeys.CANOPY_PATH_KEY);
+    if (canopyPath != null && canopyPath.length() > 0) {
       try {
-        Text key = new Text();
-        Canopy value = new Canopy();
-        while (reader.next(key, value)) {
-          canopies.add(value);
-          value = new Canopy();
+        Path path = new Path(canopyPath + "/part-00000");
+        FileSystem fs = FileSystem.get(path.toUri(), job);
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
+        try {
+          Text key = new Text();
+          Canopy value = new Canopy();
+          while (reader.next(key, value)) {
+            canopies.add(value);
+            value = new Canopy();
+          }
+        } finally {
+          reader.close();
         }
-      } finally {
-        reader.close();
+      } catch (IOException e) {
+        throw new IllegalStateException(e);
+      }
+      
+      if (canopies.isEmpty()) {
+        throw new NullPointerException("Canopies are empty!");
       }
-    } catch (IOException e) {
-      throw new IllegalStateException(e);
     }
   }
+
+  public boolean canopyCovers(Canopy canopy, Vector point) {
+    return canopyClusterer.canopyCovers(canopy, point);
+  }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java Thu Dec 10 09:34:48 2009
@@ -33,12 +33,13 @@
 public class FuzzyKMeansClusterMapper extends MapReduceBase implements
     Mapper<WritableComparable<?>, Vector, Text, FuzzyKMeansOutput> {
 
-  private List<SoftCluster> clusters;
+  private final List<SoftCluster> clusters = new ArrayList<SoftCluster>();
+  private FuzzyKMeansClusterer clusterer;
 
   @Override
   public void map(WritableComparable<?> key, Vector point,
                   OutputCollector<Text, FuzzyKMeansOutput> output, Reporter reporter) throws IOException {
-    SoftCluster.outputPointWithClusterProbabilities(key.toString(), point, clusters, output);
+    clusterer.outputPointWithClusterProbabilities(key.toString(), point, clusters, output);
   }
 
   /**
@@ -47,19 +48,21 @@
    * @param clusters a List<Cluster>
    */
   void config(List<SoftCluster> clusters) {
-    this.clusters = clusters;
+    this.clusters.clear();
+    this.clusters.addAll(clusters);
   }
 
   @Override
   public void configure(JobConf job) {
 
     super.configure(job);
-    SoftCluster.configure(job);
-    clusters = new ArrayList<SoftCluster>();
-
-    FuzzyKMeansUtil.configureWithClusterInfo(job
-        .get(SoftCluster.CLUSTER_PATH_KEY), clusters);
+    clusterer = new FuzzyKMeansClusterer(job);
 
+    String clusterPath = job.get(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY);
+    if (clusterPath != null && clusterPath.length() > 0) {
+      FuzzyKMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+    }
+    
     if (clusters.isEmpty()) {
       throw new NullPointerException("Cluster is empty!!!");
     }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansCombiner.java Thu Dec 10 09:34:48 2009
@@ -30,6 +30,8 @@
 public class FuzzyKMeansCombiner extends MapReduceBase implements
     Reducer<Text, FuzzyKMeansInfo, Text, FuzzyKMeansInfo> {
 
+  private FuzzyKMeansClusterer clusterer;
+  
   @Override
   public void reduce(Text key, Iterator<FuzzyKMeansInfo> values,
                      OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
@@ -40,7 +42,7 @@
 
       if (info.getCombinerPass() == 0) // first time thru combiner
       {
-        cluster.addPoint(info.getVector(), Math.pow(info.getProbability(), SoftCluster.getM()));
+        cluster.addPoint(info.getVector(), Math.pow(info.getProbability(), clusterer.getM()));
       } else {
         cluster.addPoints(info.getVector(), info.getProbability());
       }
@@ -53,7 +55,7 @@
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    SoftCluster.configure(job);
+    clusterer = new FuzzyKMeansClusterer(job);
   }
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Thu Dec 10 09:34:48 2009
@@ -274,10 +274,10 @@
     conf.setNumMapTasks(numMapTasks);
     conf.setNumReduceTasks(numReduceTasks);
 
-    conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
-    conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
-    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
-    conf.set(SoftCluster.M_KEY, String.valueOf(m));
+    conf.set(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+    conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+    conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+    conf.set(FuzzyKMeansConfigKeys.M_KEY, String.valueOf(m));
 
     // uncomment it to run locally
     // conf.set("mapred.job.tracker", "local");
@@ -327,10 +327,10 @@
     // conf.set("mapred.job.tracker", "local");
     conf.setNumMapTasks(numMapTasks);
     conf.setNumReduceTasks(0);
-    conf.set(SoftCluster.CLUSTER_PATH_KEY, clustersIn);
-    conf.set(SoftCluster.DISTANCE_MEASURE_KEY, measureClass);
-    conf.set(SoftCluster.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
-    conf.set(SoftCluster.M_KEY, String.valueOf(m));
+    conf.set(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+    conf.set(FuzzyKMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+    conf.set(FuzzyKMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+    conf.set(FuzzyKMeansConfigKeys.M_KEY, String.valueOf(m));
     try {
       JobClient.runJob(conf);
     } catch (IOException e) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansMapper.java Thu Dec 10 09:34:48 2009
@@ -37,12 +37,13 @@
 
   private static final Logger log = LoggerFactory.getLogger(FuzzyKMeansMapper.class);
 
-  private List<SoftCluster> clusters;
-
+  private List<SoftCluster> clusters = new ArrayList<SoftCluster>();
+  private FuzzyKMeansClusterer clusterer; 
+  
   @Override
   public void map(WritableComparable<?> key, Vector point,
                   OutputCollector<Text, FuzzyKMeansInfo> output, Reporter reporter) throws IOException {
-    SoftCluster.emitPointProbToCluster(point, clusters, output);
+    clusterer.emitPointProbToCluster(point, clusters, output);
   }
 
   /**
@@ -51,21 +52,23 @@
    * @param clusters a List<Cluster>
    */
   void config(List<SoftCluster> clusters) {
-    this.clusters = clusters;
+    this.clusters.clear();
+    this.clusters.addAll(clusters);
   }
 
   @Override
   public void configure(JobConf job) {
 
     super.configure(job);
-    SoftCluster.configure(job);
+    clusterer = new FuzzyKMeansClusterer(job);
 
     log.info("In Mapper Configure:");
-    clusters = new ArrayList<SoftCluster>();
-
-    FuzzyKMeansUtil.configureWithClusterInfo(job
-        .get(SoftCluster.CLUSTER_PATH_KEY), clusters);
 
+    String clusterPath = job.get(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY);
+    if (clusterPath != null && clusterPath.length() > 0) {
+      FuzzyKMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+    }
+    
     if (clusters.isEmpty()) {
       throw new NullPointerException("Cluster is empty!!!");
     }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansReducer.java Thu Dec 10 09:34:48 2009
@@ -34,8 +34,9 @@
 public class FuzzyKMeansReducer extends MapReduceBase implements
     Reducer<Text, FuzzyKMeansInfo, Text, SoftCluster> {
 
-  private Map<String, SoftCluster> clusterMap;
-
+  private final Map<String, SoftCluster> clusterMap = new HashMap<String, SoftCluster>();
+  private FuzzyKMeansClusterer clusterer; 
+  
   @Override
   public void reduce(Text key, Iterator<FuzzyKMeansInfo> values,
                      OutputCollector<Text, SoftCluster> output, Reporter reporter) throws IOException {
@@ -47,14 +48,14 @@
 
       if (value.getCombinerPass() == 0) // escaped from combiner
       {
-        cluster.addPoint(value.getVector(), Math.pow(value.getProbability(), SoftCluster.getM()));
+        cluster.addPoint(value.getVector(), Math.pow(value.getProbability(), clusterer.getM()));
       } else {
         cluster.addPoints(value.getVector(), value.getProbability());
       }
 
     }
     // force convergence calculation
-    cluster.computeConvergence();
+    clusterer.computeConvergence(cluster);
     output.collect(new Text(cluster.getIdentifier()), cluster);
   }
 
@@ -62,21 +63,22 @@
   public void configure(JobConf job) {
 
     super.configure(job);
-    SoftCluster.configure(job);
-    clusterMap = new HashMap<String, SoftCluster>();
+    clusterer = new FuzzyKMeansClusterer(job);
 
     List<SoftCluster> clusters = new ArrayList<SoftCluster>();
-    FuzzyKMeansUtil.configureWithClusterInfo(job
-        .get(SoftCluster.CLUSTER_PATH_KEY), clusters);
-    setClusterMap(clusters);
-
+    String clusterPath = job.get(FuzzyKMeansConfigKeys.CLUSTER_PATH_KEY);
+    if (clusterPath != null && clusterPath.length() > 0) {
+      FuzzyKMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+      setClusterMap(clusters);
+    }
+    
     if (clusterMap.isEmpty()) {
       throw new NullPointerException("Cluster is empty!!!");
     }
   }
 
   private void setClusterMap(List<SoftCluster> clusters) {
-    clusterMap = new HashMap<String, SoftCluster>();
+    clusterMap.clear();
     for (SoftCluster cluster : clusters) {
       clusterMap.put(cluster.getIdentifier(), cluster);
     }
@@ -85,7 +87,6 @@
 
   public void config(List<SoftCluster> clusters) {
     setClusterMap(clusters);
-
   }
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java Thu Dec 10 09:34:48 2009
@@ -17,46 +17,18 @@
 
 package org.apache.mahout.clustering.fuzzykmeans;
 
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.mahout.matrix.AbstractVector;
 import org.apache.mahout.matrix.SparseVector;
 import org.apache.mahout.matrix.SquareRootFunction;
 import org.apache.mahout.matrix.Vector;
-import org.apache.mahout.common.distance.DistanceMeasure;
 
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
 
 public class SoftCluster implements Writable {
 
-  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
-
-  public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
-
-  public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
-
-  public static final String M_KEY = "org.apache.mahout.clustering.fuzzykmeans.m";
-
-  private static double m = 2.0; // default value
-
-  private static final double MINIMAL_VALUE = 0.0000000001; // using it for
-
-  // adding
-
-  // exception
-  // this value to any
-  // zero valued
-  // variable to avoid
-  // divide by Zero
-
-  private static int nextClusterId = 0;
-
   // this cluster's clusterId
   private int clusterId;
 
@@ -81,11 +53,7 @@
   private Vector s1;
 
   private Vector s2;
-
-  private static DistanceMeasure measure;
-
-  private static double convergenceDelta = 0;
-
+  
   /**
    * Format the SoftCluster for output
    *
@@ -135,126 +103,12 @@
     this.weightedPointTotal = center.like();
   }
 
-
-  /**
-   * Configure the distance measure from the job
-   *
-   * @param job the JobConf for the job
-   */
-  public static void configure(JobConf job) {
-    try {
-      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-      Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
-      measure = (DistanceMeasure) cl.newInstance();
-      measure.configure(job);
-      convergenceDelta = Double.parseDouble(job.get(CLUSTER_CONVERGENCE_KEY));
-      nextClusterId = 0;
-      m = Double.parseDouble(job.get(M_KEY));
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    }
-  }
-
-  /**
-   * Configure the distance measure directly. Used by unit tests.
-   *
-   * @param aMeasure          the DistanceMeasure
-   * @param aConvergenceDelta the delta value used to define convergence
-   */
-  public static void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
-    measure = aMeasure;
-    convergenceDelta = aConvergenceDelta;
-    nextClusterId = 0;
-  }
-
-  /**
-   * Emit the point and its probability of belongingness to each cluster
-   *
-   * @param point    a point
-   * @param clusters a List<SoftCluster>
-   * @param output   the OutputCollector to emit into
-   */
-  public static void emitPointProbToCluster(Vector point,
-                                            List<SoftCluster> clusters,
-                                            OutputCollector<Text, FuzzyKMeansInfo> output) throws IOException {
-    List<Double> clusterDistanceList = new ArrayList<Double>();
-    for (SoftCluster cluster : clusters) {
-      clusterDistanceList.add(measure.distance(cluster.getCenter(), point));
-    }
-
-    for (int i = 0; i < clusters.size(); i++) {
-      double probWeight = computeProbWeight(clusterDistanceList.get(i),
-          clusterDistanceList);
-      Text key = new Text(clusters.get(i).getIdentifier()); // just output the
-      // identifier,avoids
-      // too much data
-      // traffic
-      /*Text value = new Text(Double.toString(probWeight)
-          + FuzzyKMeansDriver.MAPPER_VALUE_SEPARATOR + values.toString());*/
-      FuzzyKMeansInfo value = new FuzzyKMeansInfo(probWeight, point);
-      output.collect(key, value);
-    }
-  }
-
-  /**
-   * Output point with cluster info (Cluster and probability)
-   *
-   * @param point    a point
-   * @param clusters a List<SoftCluster> to test
-   * @param output   the OutputCollector to emit into
-   */
-  public static void outputPointWithClusterProbabilities(String key,
-                                                         Vector point, List<SoftCluster> clusters,
-                                                         OutputCollector<Text, FuzzyKMeansOutput> output) throws IOException {
-    List<Double> clusterDistanceList = new ArrayList<Double>();
-
-    for (SoftCluster cluster : clusters) {
-      clusterDistanceList.add(measure.distance(point, cluster.getCenter()));
-    }
-    FuzzyKMeansOutput fOutput = new FuzzyKMeansOutput(clusters.size());
-    for (int i = 0; i < clusters.size(); i++) {
-      // System.out.print("cluster:" + i + "\t" + clusterDistanceList.get(i));
-
-      double probWeight = computeProbWeight(clusterDistanceList.get(i),
-          clusterDistanceList);
-      /*outputValue.append(clusters.get(i).clusterId).append(':').append(
-          probWeight).append(' ');*/
-      fOutput.add(i, clusters.get(i), probWeight);
-    }
-    String name = point.getName();
-    output.collect(new Text(name != null && name.length() != 0 ? name
-        : point.asFormatString()),
-        fOutput);
-  }
-
-  /** Computes the probability of a point belonging to a cluster */
-  public static double computeProbWeight(double clusterDistance,
-                                         List<Double> clusterDistanceList) {
-    if (clusterDistance == 0) {
-      clusterDistance = MINIMAL_VALUE;
-    }
-    double denom = 0.0;
-    for (double eachCDist : clusterDistanceList) {
-      if (eachCDist == 0.0) {
-        eachCDist = MINIMAL_VALUE;
-      }
-
-      denom += Math.pow(clusterDistance / eachCDist, 2.0 / (m - 1));
-
-    }
-    return 1.0 / denom;
-  }
-
   /**
    * Compute the centroid
    *
    * @return the new centroid
    */
-  private Vector computeCentroid() {
+  public Vector computeCentroid() {
     if (pointProbSum == 0) {
       return weightedPointTotal;
     } else if (centroid == null) {
@@ -274,7 +128,6 @@
    * @param center the center point
    */
   public SoftCluster(Vector center) {
-    this.clusterId = nextClusterId++;
     this.center = center;
     this.pointProbSum = 0;
 
@@ -389,17 +242,6 @@
     weightedPointTotal = center.like();
   }
 
-  /**
-   * Return if the cluster is converged by comparing its center and centroid.
-   *
-   * @return if the cluster is converged
-   */
-  public boolean computeConvergence() {
-    Vector centroid = computeCentroid();
-    converged = measure.distance(center, centroid) <= convergenceDelta;
-    return converged;
-  }
-
   public Vector getWeightedPointTotal() {
     return weightedPointTotal;
   }
@@ -412,7 +254,7 @@
     return converged;
   }
 
-  private void setConverged(boolean converged) {
+  public void setConverged(boolean converged) {
     this.converged = converged;
   }
 
@@ -420,8 +262,6 @@
     return clusterId;
   }
 
-  public static double getM() {
-    return m;
-  }
+
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Thu Dec 10 09:34:48 2009
@@ -16,9 +16,6 @@
  */
 package org.apache.mahout.clustering.kmeans;
 
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.matrix.AbstractVector;
 import org.apache.mahout.matrix.SquareRootFunction;
@@ -28,42 +25,26 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.List;
 
 public class Cluster extends ClusterBase {
 
-  private static final String ERROR_UNKNOWN_CLUSTER_FORMAT="Unknown cluster format:\n";
-    
-  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
+  /** Error message for unknown cluster format in output. */
+  private static final String ERROR_UNKNOWN_CLUSTER_FORMAT = "Unknown cluster format:\n";
 
-  public static final String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
-
-  public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
-
-  /** The number of iterations that have taken place */
-  public static final String ITERATION_NUMBER = "org.apache.mahout.clustering.kmeans.iteration";
-  /** Boolean value indicating whether the initial input is from Canopy clustering */
-  public static final String CANOPY_INPUT = "org.apache.mahout.clustering.kmeans.canopyInput";
-
-  private static int nextClusterId = 0;
-
-
-  // the current centroid is lazy evaluated and may be null
+  /** The current centroid is lazy evaluated and may be null */
   private Vector centroid = null;
 
-
-  // the total of all the points squared, used for std computation
+  /** The total of all the points squared, used for std computation */
   private Vector pointSquaredTotal = null;
 
-  // has the centroid converged with the center?
+  /** Has the centroid converged with the center? */
   private boolean converged = false;
-  private static DistanceMeasure measure;
-  private static double convergenceDelta = 0;
 
   /**
    * Format the cluster for output
-   *
-   * @param cluster the Cluster
+   * 
+   * @param cluster
+   *          the Cluster
    * @return the String representation of the Cluster
    */
   public static String formatCluster(Cluster cluster) {
@@ -77,16 +58,19 @@
   }
 
   /**
-   * Decodes and returns a Cluster from the formattedString
-   *
-   * @param formattedString a String produced by formatCluster
+   * Decodes and returns a Cluster from the formattedString.
+   * 
+   * @param formattedString
+   *          a String produced by formatCluster
    * @return a decoded Cluster, not null
-   * @throws IllegalArgumentException when the string is wrongly formatted
+   * @throws IllegalArgumentException
+   *           when the string is wrongly formatted
    */
   public static Cluster decodeCluster(String formattedString) {
     final int beginIndex = formattedString.indexOf('{');
     if (beginIndex <= 0) {
-      throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT + formattedString);
+      throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT
+          + formattedString);
     }
     final String id = formattedString.substring(0, beginIndex);
     final String center = formattedString.substring(beginIndex);
@@ -95,17 +79,17 @@
     final Cluster cluster;
     if (firstChar == 'C' || startsWithV) {
       final int clusterId = Integer.parseInt(formattedString.substring(1,
-        beginIndex - 2));
+          beginIndex - 2));
       final Vector clusterCenter = AbstractVector.decodeVector(center);
       cluster = new Cluster(clusterCenter, clusterId);
       cluster.setConverged(startsWithV);
     } else {
-     throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT + formattedString);
+      throw new IllegalArgumentException(ERROR_UNKNOWN_CLUSTER_FORMAT
+          + formattedString);
     }
     return cluster;
   }
 
-
   @Override
   public void write(DataOutput out) throws IOException {
     super.write(out);
@@ -124,84 +108,8 @@
   }
 
   /**
-   * Configure the distance measure from the job
-   *
-   * @param job the JobConf for the job
-   */
-  public static void configure(JobConf job) {
-    try {
-      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-      Class<?> cl = ccl.loadClass(job.get(DISTANCE_MEASURE_KEY));
-      measure = (DistanceMeasure) cl.newInstance();
-      measure.configure(job);
-      convergenceDelta = Double.parseDouble(job.get(CLUSTER_CONVERGENCE_KEY));
-      nextClusterId = 0;
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    }
-  }
-
-  /**
-   * Configure the distance measure directly. Used by unit tests.
-   *
-   * @param aMeasure          the DistanceMeasure
-   * @param aConvergenceDelta the delta value used to define convergence
-   */
-  public static void config(DistanceMeasure aMeasure, double aConvergenceDelta) {
-    measure = aMeasure;
-    convergenceDelta = aConvergenceDelta;
-    nextClusterId = 0;
-  }
-
-  /**
-   * Emit the point to the nearest cluster center
-   *
-   * @param point    a point
-   * @param clusters a List<Cluster> to test
-   * @param output   the OutputCollector to emit into
-   */
-  public static void emitPointToNearestCluster(Vector point,
-                                               List<Cluster> clusters, OutputCollector<Text, KMeansInfo> output)
-      throws IOException {
-    Cluster nearestCluster = null;
-    double nearestDistance = Double.MAX_VALUE;
-    for (Cluster cluster : clusters) {
-      Vector clusterCenter = cluster.getCenter();
-      double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
-      if (distance < nearestDistance || nearestCluster == null ) {
-        nearestCluster = cluster;
-        nearestDistance = distance;
-      }
-    }
-    // emit only clusterID
-    output.collect(new Text(nearestCluster.getIdentifier()), new KMeansInfo(1, point));
-  }
-
-  public static void outputPointWithClusterInfo(Vector point,
-                                                List<Cluster> clusters, OutputCollector<Text, Text> output)
-      throws IOException {
-    Cluster nearestCluster = null;
-    double nearestDistance = Double.MAX_VALUE;
-    for (Cluster cluster : clusters) {
-      Vector clusterCenter = cluster.getCenter();
-      double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
-      if (distance < nearestDistance || nearestCluster == null) {
-        nearestCluster = cluster;
-        nearestDistance = distance;
-      }
-    }
-    //TODO: this is ugly
-    String name = point.getName();
-    output.collect(new Text(name != null && name.length() != 0 ? name : point.asFormatString()), new Text(String.valueOf(nearestCluster.getId())));
-  }
-
-  /**
    * Compute the centroid by averaging the pointTotals
-   *
+   * 
    * @return the new centroid
    */
   private Vector computeCentroid() {
@@ -216,12 +124,12 @@
 
   /**
    * Construct a new cluster with the given point as its center
-   *
-   * @param center the center point
+   * 
+   * @param center
+   *          the center point
    */
   public Cluster(Vector center) {
     super();
-    this.setId(nextClusterId++);
     this.setCenter(center);
     this.setNumPoints(0);
     this.setPointTotal(center.like());
@@ -234,8 +142,9 @@
 
   /**
    * Construct a new cluster with the given point as its center
-   *
-   * @param center the center point
+   * 
+   * @param center
+   *          the center point
    */
   public Cluster(Vector center, int clusterId) {
     super();
@@ -269,8 +178,9 @@
 
   /**
    * Add the point to the cluster
-   *
-   * @param point a point to add
+   * 
+   * @param point
+   *          a point to add
    */
   public void addPoint(Vector point) {
     addPoints(1, point);
@@ -278,9 +188,11 @@
 
   /**
    * Add the point to the cluster
-   *
-   * @param count the number of points in the delta
-   * @param delta a point to add
+   * 
+   * @param count
+   *          the number of points in the delta
+   * @param delta
+   *          a point to add
    */
   public void addPoints(int count, Vector delta) {
     centroid = null;
@@ -294,7 +206,6 @@
     }
   }
 
-
   /** Compute the centroid and set the center to it. */
   public void recomputeCenter() {
     setCenter(computeCentroid());
@@ -304,16 +215,21 @@
 
   /**
    * Return if the cluster is converged by comparing its center and centroid.
-   *
+   * 
+   * @param measure
+   *          The distance measure used to compare the centroid with the center.
+   * @param convergenceDelta
+   *          the convergence delta to use for stopping.
    * @return if the cluster is converged
    */
-  public boolean computeConvergence() {
+  public boolean computeConvergence(final DistanceMeasure measure,
+      final double convergenceDelta) {
     Vector centroid = computeCentroid();
-    converged = measure.distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta;
+    converged = measure.distance(centroid.getLengthSquared(), centroid,
+        getCenter()) <= convergenceDelta;
     return converged;
   }
 
-
   public boolean isConverged() {
     return converged;
   }
@@ -325,8 +241,8 @@
   /** @return the std */
   public double getStd() {
     Vector stds = pointSquaredTotal.times(getNumPoints()).minus(
-          getPointTotal().times(getPointTotal())).assign(new SquareRootFunction())
-          .divide(getNumPoints());
+        getPointTotal().times(getPointTotal()))
+        .assign(new SquareRootFunction()).divide(getNumPoints());
     return stds.zSum() / 2;
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Thu Dec 10 09:34:48 2009
@@ -24,6 +24,7 @@
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.matrix.Vector;
 
 import java.io.IOException;
@@ -33,34 +34,52 @@
 public class KMeansClusterMapper extends MapReduceBase implements
     Mapper<WritableComparable<?>, Vector, Text, Text> {
 
-  private List<Cluster> clusters;
+  private final List<Cluster> clusters = new ArrayList<Cluster>();
+  private KMeansClusterer clusterer;
 
   @Override
-  public void map(WritableComparable<?> key, Vector point, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
-    Cluster.outputPointWithClusterInfo(point, clusters, output);
+  public void map(WritableComparable<?> key, Vector point,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    this.clusterer.outputPointWithClusterInfo(point, clusters, output);
   }
 
   /**
    * Configure the mapper by providing its clusters. Used by unit tests.
-   *
-   * @param clusters a List<Cluster>
+   * 
+   * @param clusters
+   *          a List<Cluster>
    */
   void config(List<Cluster> clusters) {
-    this.clusters = clusters;
+    this.clusters.clear();
+    this.clusters.addAll(clusters);
   }
 
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    Cluster.configure(job);
 
-    clusters = new ArrayList<Cluster>();
-
-    KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
-        clusters);
-
-    if (clusters.isEmpty()) {
-      throw new NullPointerException("Cluster is empty!!!");
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      Class<?> cl = ccl.loadClass(job
+          .get(KMeansConfigKeys.DISTANCE_MEASURE_KEY));
+      DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
+      measure.configure(job);
+
+      String clusterPath = job.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
+      if (clusterPath != null && clusterPath.length() > 0) {
+        KMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+        if (clusters.isEmpty()) {
+          throw new IllegalStateException("Cluster is empty!");
+        }
+      }
+
+      this.clusterer = new KMeansClusterer(measure);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
     }
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansCombiner.java Thu Dec 10 09:34:48 2009
@@ -44,7 +44,6 @@
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    Cluster.configure(job);
   }
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Thu Dec 10 09:34:48 2009
@@ -230,11 +230,11 @@
     conf.setCombinerClass(KMeansCombiner.class);
     conf.setReducerClass(KMeansReducer.class);
     conf.setNumReduceTasks(numReduceTasks);
-    conf.set(Cluster.CLUSTER_PATH_KEY, clustersIn);
-    conf.set(Cluster.DISTANCE_MEASURE_KEY, measureClass);
-    conf.set(Cluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-    conf.setInt(Cluster.ITERATION_NUMBER, iteration);
-
+    conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
+    conf.set(KMeansConfigKeys.ITERATION_NUMBER, String.valueOf(iteration));
+    
     try {
       JobClient.runJob(conf);
       FileSystem fs = FileSystem.get(outPath.toUri(), conf);
@@ -277,9 +277,9 @@
 
     conf.setMapperClass(KMeansClusterMapper.class);
     conf.setNumReduceTasks(0);
-    conf.set(Cluster.CLUSTER_PATH_KEY, clustersIn);
-    conf.set(Cluster.DISTANCE_MEASURE_KEY, measureClass);
-    conf.set(Cluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
+    conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
+    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
+    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
 
     try {
       JobClient.runJob(conf);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansMapper.java Thu Dec 10 09:34:48 2009
@@ -23,43 +23,64 @@
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.matrix.Vector;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 
 public class KMeansMapper extends MapReduceBase implements
     Mapper<WritableComparable<?>, Vector, Text, KMeansInfo> {
 
-  private List<Cluster> clusters;
+  private KMeansClusterer clusterer;
+  private final List<Cluster> clusters = new ArrayList<Cluster>();
 
   @Override
   public void map(WritableComparable<?> key, Vector point,
-                  OutputCollector<Text, KMeansInfo> output, Reporter reporter) throws IOException {
-    Cluster.emitPointToNearestCluster(point, clusters, output);
+      OutputCollector<Text, KMeansInfo> output, Reporter reporter)
+      throws IOException {
+   this.clusterer.emitPointToNearestCluster(point, this.clusters, output);
   }
 
   /**
    * Configure the mapper by providing its clusters. Used by unit tests.
-   *
-   * @param clusters a List<Cluster>
+   * 
+   * @param clusters
+   *          a List<Cluster>
    */
   void config(List<Cluster> clusters) {
-    this.clusters = clusters;
+    this.clusters.clear();
+    this.clusters.addAll(clusters);
   }
 
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    Cluster.configure(job);
-
-    clusters = new ArrayList<Cluster>();
-    KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
-        clusters);
-
-    if (clusters.isEmpty()) {
-      throw new NullPointerException("Cluster is empty!!!");
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      Class<?> cl = ccl.loadClass(job
+          .get(KMeansConfigKeys.DISTANCE_MEASURE_KEY));
+      DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
+      measure.configure(job);
+
+      this.clusterer = new KMeansClusterer(measure);
+
+      String clusterPath = job.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
+      if (clusterPath != null && clusterPath.length() > 0) {
+        KMeansUtil.configureWithClusterInfo(clusterPath, clusters);
+        if (clusters.isEmpty()) {
+          throw new IllegalStateException("Cluster is empty!");
+        }
+      }
+
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
     }
   }
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansReducer.java Thu Dec 10 09:34:48 2009
@@ -22,6 +22,7 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.common.distance.DistanceMeasure;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -34,6 +35,8 @@
     Reducer<Text, KMeansInfo, Text, Cluster> {
 
   private Map<String, Cluster> clusterMap;
+  private double convergenceDelta;
+  private DistanceMeasure measure;
 
   @Override
   public void reduce(Text key, Iterator<KMeansInfo> values,
@@ -45,7 +48,7 @@
       cluster.addPoints(delta.getPoints(), delta.getPointTotal());
     }
     // force convergence calculation
-    cluster.computeConvergence();
+    cluster.computeConvergence(this.measure, this.convergenceDelta);
     output.collect(new Text(cluster.getIdentifier()), cluster);
   }
 
@@ -53,16 +56,33 @@
   public void configure(JobConf job) {
 
     super.configure(job);
-    Cluster.configure(job);
-    clusterMap = new HashMap<String, Cluster>();
-
-    List<Cluster> clusters = new ArrayList<Cluster>();
-    KMeansUtil.configureWithClusterInfo(job.get(Cluster.CLUSTER_PATH_KEY),
-        clusters);
-    setClusterMap(clusters);
-
-    if (clusterMap.isEmpty()) {
-      throw new NullPointerException("Cluster is empty!!!");
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      Class<?> cl = ccl.loadClass(job
+          .get(KMeansConfigKeys.DISTANCE_MEASURE_KEY));
+      this.measure = (DistanceMeasure) cl.newInstance();
+      this.measure.configure(job);
+
+      this.convergenceDelta = Double.parseDouble(job
+          .get(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY));
+
+      this.clusterMap = new HashMap<String, Cluster>();
+
+      String path = job.get(KMeansConfigKeys.CLUSTER_PATH_KEY);
+      if (path != null && path.length() > 0) { // key may be unset
+        List<Cluster> clusters = new ArrayList<Cluster>();
+        KMeansUtil.configureWithClusterInfo(path, clusters);
+        setClusterMap(clusters);
+        if (clusterMap.isEmpty()) {
+          throw new NullPointerException("Cluster is empty!");
+        }
+      }
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
     }
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java Thu Dec 10 09:34:48 2009
@@ -73,8 +73,9 @@
 
       List<Text> chosenTexts = new ArrayList<Text>(k);
       List<Cluster> chosenClusters = new ArrayList<Cluster>(k);
+      int nextClusterId = 0;
       while (reader.next(key, value)) {
-        Cluster newCluster = new Cluster(value);
+        Cluster newCluster = new Cluster(value, nextClusterId++);
         newCluster.addPoint(value);
         Text newText = new Text(key.toString());
         int currentSize = chosenTexts.size();

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java Thu Dec 10 09:34:48 2009
@@ -20,19 +20,14 @@
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.reflect.TypeToken;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.mahout.clustering.ClusterBase;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 import org.apache.mahout.matrix.AbstractVector;
 import org.apache.mahout.matrix.CardinalityException;
 import org.apache.mahout.matrix.DenseVector;
 import org.apache.mahout.matrix.JsonVectorAdapter;
 import org.apache.mahout.matrix.PlusFunction;
 import org.apache.mahout.matrix.Vector;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
 
 import java.io.DataInput;
 import java.io.DataOutput;
@@ -48,163 +43,53 @@
  */
 public class MeanShiftCanopy extends ClusterBase {
 
-  // keys used by Driver, Mapper, Combiner & Reducer
-  public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
-
-  public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
-
-  public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
-
-  public static final String CONTROL_PATH_KEY = "org.apache.mahout.clustering.control.path";
-
-  public static final String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.canopy.convergence";
-
-  private static double convergenceDelta = 0;
-
-  // the next canopyId to be allocated
-  private static int nextCanopyId = 0;
-
-  // the T1 distance threshold
-  private static double t1;
-
-  // the T2 distance threshold
-  private static double t2;
-
-  // the distance measure
-  private static DistanceMeasure measure;
-
-  // TODO: this is problematic, but how else to encode membership?
+   // TODO: this is problematic, but how else to encode membership?
   private List<Vector> boundPoints = new ArrayList<Vector>();
 
   private boolean converged = false;
 
-  static double getT1() {
-    return t1;
-  }
-
-  static double getT2() {
-    return t2;
-  }
-
-  /**
-   * Configure the Canopy and its distance measure
-   *
-   * @param job the JobConf for this job
-   */
-  public static void configure(JobConf job) {
-    try {
-      measure = Class.forName(job.get(DISTANCE_MEASURE_KEY)).asSubclass(
-          DistanceMeasure.class).newInstance();
-      measure.configure(job);
-    } catch (ClassNotFoundException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    }
-    nextCanopyId = 0;
-    t1 = Double.parseDouble(job.get(T1_KEY));
-    t2 = Double.parseDouble(job.get(T2_KEY));
-    convergenceDelta = Double.parseDouble(job.get(CLUSTER_CONVERGENCE_KEY));
-  }
-
-  /**
-   * Configure the Canopy for unit tests
-   *
-   * @param aDelta the convergence criteria
-   */
-  public static void config(DistanceMeasure aMeasure, double aT1, double aT2,
-                            double aDelta) {
-    nextCanopyId = 100; // so canopyIds will sort properly
-    measure = aMeasure;
-    t1 = aT1;
-    t2 = aT2;
-    convergenceDelta = aDelta;
-  }
-
-  /**
-   * Merge the given canopy into the canopies list. If it touches any existing canopy (norm<T1) then add the center of
-   * each to the other. If it covers any other canopies (norm<T2), then merge the given canopy with the closest covering
-   * canopy. If the given canopy does not cover any other canopies, add it to the canopies list.
-   *
-   * @param aCanopy  a MeanShiftCanopy to be merged
-   * @param canopies the List<Canopy> to be appended
-   */
-  public static void mergeCanopy(MeanShiftCanopy aCanopy,
-                                 List<MeanShiftCanopy> canopies) {
-    MeanShiftCanopy closestCoveringCanopy = null;
-    double closestNorm = Double.MAX_VALUE;
-    for (MeanShiftCanopy canopy : canopies) {
-      double norm = measure.distance(canopy.getCenter(), aCanopy.getCenter());
-      if (norm < t1) {
-        aCanopy.touch(canopy);
-      }
-      if (norm < t2) {
-        if (closestCoveringCanopy == null || norm < closestNorm) {
-          closestNorm = norm;
-          closestCoveringCanopy = canopy;
-        }
-      }
-    }
-    if (closestCoveringCanopy == null) {
-      canopies.add(aCanopy);
-    } else {
-      closestCoveringCanopy.merge(aCanopy);
-    }
-  }
-
-  /** Format the canopy for output */
-  public static String formatCanopy(MeanShiftCanopy canopy) {
-    Type vectorType = new TypeToken<Vector>() {
-    }.getType();
-    GsonBuilder gBuilder = new GsonBuilder();
-    gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
-    Gson gson = gBuilder.create();
-    return gson.toJson(canopy, MeanShiftCanopy.class);
-  }
-
-  /**
-   * Decodes and returns a Canopy from the formattedString
-   *
-   * @param formattedString a String produced by formatCanopy
-   * @return a new Canopy
-   */
-  public static MeanShiftCanopy decodeCanopy(String formattedString) {
-    Type vectorType = new TypeToken<Vector>() {
-    }.getType();
-    GsonBuilder gBuilder = new GsonBuilder();
-    gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
-    Gson gson = gBuilder.create();
-    return gson.fromJson(formattedString, MeanShiftCanopy.class);
-  }
-
   public MeanShiftCanopy() {
     super();
   }
 
   /** Create a new Canopy with the given canopyId */
+  /*
   public MeanShiftCanopy(String id) {
     this.setId(Integer.parseInt(id.substring(1)));
     this.setCenter(null);
     this.setPointTotal(null);
     this.setNumPoints(0);
   }
+  */
 
   /**
    * Create a new Canopy containing the given point
    *
    * @param point a Vector
    */
+  /*
   public MeanShiftCanopy(Vector point) {
-    this.setId(nextCanopyId++);
     this.setCenter(point);
     this.setPointTotal(point.clone());
     this.setNumPoints(1);
     this.boundPoints.add(point);
   }
+  */
 
   /**
+   * Create a new Canopy containing the given point
+   *
+   * @param point a Vector
+   */
+  public MeanShiftCanopy(Vector point, int id) {
+    this.setId(id);
+    this.setCenter(point);
+    this.setPointTotal(point.clone());
+    this.setNumPoints(1);
+    this.boundPoints.add(point);
+  }
+  
+  /**
    * Create a new Canopy containing the given point, id and bound points
    *
    * @param point       a Vector
@@ -236,16 +121,6 @@
   }
 
   /**
-   * Return if the point is closely covered by this canopy
-   *
-   * @param point a Vector point
-   * @return if the point is covered
-   */
-  public boolean closelyBound(Vector point) {
-    return measure.distance(getCenter(), point) < t2;
-  }
-
-  /**
    * Compute the bound centroid by averaging the bound points
    *
    * @return a Vector which is the new bound centroid
@@ -271,25 +146,6 @@
     }
   }
 
-  /**
-   * Return if the point is covered by this canopy
-   *
-   * @param point a Vector point
-   * @return if the point is covered
-   */
-  boolean covers(Vector point) {
-    return measure.distance(getCenter(), point) < t1;
-  }
-
-  /** Emit the new canopy to the collector, keyed by the canopy's Id */
-  void emitCanopy(MeanShiftCanopy canopy,
-                  OutputCollector<Text, WritableComparable<?>> collector)
-      throws IOException {
-    String identifier = this.getIdentifier();
-    collector.collect(new Text(identifier),
-        new Text("new " + canopy.toString()));
-  }
-
   public List<Vector> getBoundPoints() {
     return boundPoints;
   }
@@ -322,20 +178,6 @@
     boundPoints.addAll(canopy.boundPoints);
   }
 
-  /**
-   * Shift the center to the new centroid of the cluster
-   *
-   * @return if the cluster is converged
-   */
-  public boolean shiftToMean() {
-    Vector centroid = computeCentroid();
-    converged = new EuclideanDistanceMeasure().distance(centroid, getCenter()) < convergenceDelta;
-    setCenter(centroid);
-    setNumPoints(1);
-    setPointTotal(centroid.clone());
-    return converged;
-  }
-
   @Override
   public String toString() {
     return formatCanopy(this);
@@ -386,4 +228,38 @@
   public String asFormatString() {
     return formatCanopy(this);
   }
+  
+  public void setBoundPoints(List<Vector> boundPoints) {
+    this.boundPoints = boundPoints;
+  }
+
+  public void setConverged(boolean converged) {
+    this.converged = converged;
+  }
+
+  /** Format the canopy for output */
+  public static String formatCanopy(MeanShiftCanopy canopy) {
+    Type vectorType = new TypeToken<Vector>() {
+    }.getType();
+    GsonBuilder gBuilder = new GsonBuilder();
+    gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
+    Gson gson = gBuilder.create();
+    return gson.toJson(canopy, MeanShiftCanopy.class);
+  }
+
+  /**
+   * Decodes and returns a Canopy from the formattedString
+   *
+   * @param formattedString a String produced by formatCanopy
+   * @return a new Canopy
+   */
+  public static MeanShiftCanopy decodeCanopy(String formattedString) {
+    Type vectorType = new TypeToken<Vector>() {
+    }.getType();
+    GsonBuilder gBuilder = new GsonBuilder();
+    gBuilder.registerTypeAdapter(vectorType, new JsonVectorAdapter());
+    Gson gson = gBuilder.create();
+    return gson.fromJson(formattedString, MeanShiftCanopy.class);
+  }
+
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Thu Dec 10 09:34:48 2009
@@ -90,7 +90,7 @@
       double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt).toString());
       double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt).toString());
       double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt).toString());
-      runJob(input, output, output + MeanShiftCanopy.CONTROL_PATH_KEY,
+      runJob(input, output, output + MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY,
         measureClassName, t1, t2, convergenceDelta);
     } catch (OptionException e) {
       log.error("Exception parsing command line: ", e);
@@ -127,12 +127,11 @@
     conf.setNumReduceTasks(1);
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
-    conf.set(MeanShiftCanopy.DISTANCE_MEASURE_KEY, measureClassName);
-    conf.set(MeanShiftCanopy.CLUSTER_CONVERGENCE_KEY, String
-        .valueOf(convergenceDelta));
-    conf.set(MeanShiftCanopy.T1_KEY, String.valueOf(t1));
-    conf.set(MeanShiftCanopy.T2_KEY, String.valueOf(t2));
-    conf.set(MeanShiftCanopy.CONTROL_PATH_KEY, control);
+    conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
+    conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, String.valueOf(convergenceDelta));
+    conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
+    conf.set(MeanShiftCanopyConfigKeys.T2_KEY, String.valueOf(t2));
+    conf.set(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY, control);
 
     client.setConf(conf);
     try {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyMapper.java Thu Dec 10 09:34:48 2009
@@ -33,7 +33,8 @@
     Mapper<WritableComparable<?>, MeanShiftCanopy, Text, MeanShiftCanopy> {
 
   private final List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
+  
+  private MeanShiftCanopyClusterer clusterer;
   private OutputCollector<Text, MeanShiftCanopy> output;
 
   @Override
@@ -41,13 +42,13 @@
                   OutputCollector<Text, MeanShiftCanopy> output, Reporter reporter)
       throws IOException {
     this.output = output;
-    MeanShiftCanopy.mergeCanopy(canopy.shallowCopy(), canopies);
+    clusterer.mergeCanopy(canopy.shallowCopy(), canopies);
   }
 
   @Override
   public void close() throws IOException {
     for (MeanShiftCanopy canopy : canopies) {
-      canopy.shiftToMean();
+      clusterer.shiftToMean(canopy);
       output.collect(new Text("canopy"), canopy);
     }
     super.close();
@@ -56,7 +57,7 @@
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    MeanShiftCanopy.configure(job);
+    clusterer = new MeanShiftCanopyClusterer(job);
   }
 
 }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java?rev=889158&r1=889157&r2=889158&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyReducer.java Thu Dec 10 09:34:48 2009
@@ -35,7 +35,7 @@
     Reducer<Text, MeanShiftCanopy, Text, MeanShiftCanopy> {
 
   private final List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
+  private MeanShiftCanopyClusterer clusterer;
   private boolean allConverged = true;
 
   private JobConf conf;
@@ -47,11 +47,11 @@
 
     while (values.hasNext()) {
       MeanShiftCanopy canopy = values.next();
-      MeanShiftCanopy.mergeCanopy(canopy.shallowCopy(), canopies);
+      clusterer.mergeCanopy(canopy.shallowCopy(), canopies);
     }
 
     for (MeanShiftCanopy canopy : canopies) {
-      allConverged = canopy.shiftToMean() && allConverged;
+      allConverged = clusterer.shiftToMean(canopy) && allConverged;
       output.collect(new Text(canopy.getIdentifier()), canopy);
     }
 
@@ -61,13 +61,13 @@
   public void configure(JobConf job) {
     super.configure(job);
     this.conf = job;
-    MeanShiftCanopy.configure(job);
+    clusterer = new MeanShiftCanopyClusterer(job);
   }
 
   @Override
   public void close() throws IOException {
     if (allConverged) {
-      Path path = new Path(conf.get(MeanShiftCanopy.CONTROL_PATH_KEY));
+      Path path = new Path(conf.get(MeanShiftCanopyConfigKeys.CONTROL_PATH_KEY));
       FileSystem.get(conf).createNewFile(path);
     }
     super.close();