You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ak...@apache.org on 2014/06/01 02:07:22 UTC

git commit: Removing references to deprecated canopy clustering tests and examples.

Repository: mahout
Updated Branches:
  refs/heads/master 1d1134ee9 -> 88bddb08a


Removing references to deprecated canopy clustering tests and examples.


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/88bddb08
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/88bddb08
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/88bddb08

Branch: refs/heads/master
Commit: 88bddb08ab014df9d1d14b75fc96f6a8380b4545
Parents: 1d1134e
Author: Andrew Musselman <ak...@apache.org>
Authored: Sat May 31 15:24:11 2014 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Sat May 31 15:24:11 2014 -0700

----------------------------------------------------------------------
 examples/bin/cluster-syntheticcontrol.sh        |  1 -
 .../mahout/clustering/TestClusterDumper.java    | 66 +++++++++-----------
 .../mahout/clustering/TestClusterEvaluator.java |  2 +-
 .../mahout/clustering/AbstractCluster.java      | 20 ------
 .../org/apache/mahout/clustering/Cluster.java   |  5 +-
 .../mahout/clustering/TestClusterInterface.java | 47 --------------
 .../iterator/TestClusterClassifier.java         | 15 -----
 7 files changed, 33 insertions(+), 123 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh
index 550964e..188d166 100755
--- a/examples/bin/cluster-syntheticcontrol.sh
+++ b/examples/bin/cluster-syntheticcontrol.sh
@@ -34,7 +34,6 @@ else
   echo "Please select a number to choose the corresponding clustering algorithm"
   echo "1. ${algorithm[0]} clustering"
   echo "2. ${algorithm[1]} clustering"
-  echo "3. ${algorithm[2]} clustering"
   read -p "Enter your choice : " choice
 fi
 echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"

http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
index f273f3c..4bbab65 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -33,9 +33,9 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
@@ -177,49 +177,42 @@ public final class TestClusterDumper extends MahoutTestCase {
   }
 
   @Test
-  public void testCanopy() throws Exception { // now run the Job
-    DistanceMeasure measure = new EuclideanDistanceMeasure();
-
-    Path output = getTestTempDirPath("output");
-    CanopyDriver.run(getConfiguration(), getTestTempDirPath("testdata"),
-        output, measure, 8, 4, true, 0.0, true);
-    // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
-        "clusters-0-final"), new Path(output, "clusteredPoints"));
-    clusterDumper.printClusters(termDictionary);
-  }
-
-  @Test
   public void testKmeans() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
+    Path input = getTestTempFilePath("input");
     Path output = getTestTempDirPath("output");
+    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
     Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
-        4, false, 0.0, true);
-    // now run the KMeans job
+    FileSystem fs = FileSystem.get(conf);
+    // Write test data to file
+    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+    // Select initial centroids
+    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+    // Run k-means
     Path kMeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
-        "clusters-0-final"), kMeansOutput, 0.001, 10, true, 0.0, false);
-    // run ClusterDumper
+    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
+    // Print out clusters
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
-        output, 10), new Path(kMeansOutput, "clusteredPoints"));
+            output, 10), new Path(kMeansOutput, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
   @Test
   public void testJsonClusterDumper() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
+    Path input = getTestTempFilePath("input");
     Path output = getTestTempDirPath("output");
+    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
     Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
-        4, false, 0.0, true);
-    // now run the KMeans job
+    FileSystem fs = FileSystem.get(conf);
+    // Write test data to file
+    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+    // Select initial centroids
+    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+    // Run k-means
     Path kmeansOutput = new Path(output, "kmeans");
-    KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
-        "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, false);
-    // run ClusterDumper
+    KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
+    // Print out clusters
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
         output, 10), new Path(kmeansOutput, "clusteredPoints"));
     clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
@@ -229,15 +222,18 @@ public final class TestClusterDumper extends MahoutTestCase {
   @Test
   public void testFuzzyKmeans() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    // now run the Canopy job to prime kMeans canopies
+    Path input = getTestTempFilePath("input");
     Path output = getTestTempDirPath("output");
+    Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
     Configuration conf = getConfiguration();
-    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
-        4, false, 0.0, true);
-    // now run the Fuzzy KMeans job
+    FileSystem fs = FileSystem.get(conf);
+    // Write test data to file
+    ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+    // Select initial centroids
+    RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+    // Run fuzzy k-means
     Path kMeansOutput = new Path(output, "kmeans");
-    FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(
-        output, "clusters-0-final"), kMeansOutput, 0.001, 10, 1.1f, true,
+    FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
         true, 0, true);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,

http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
index d0a54cf..8a226a0 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
@@ -138,7 +138,7 @@ public final class TestClusterEvaluator extends MahoutTestCase {
       points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
     }
   }
-  
+
   @Test
   public void testRepresentativePoints() throws Exception {
     ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);

http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java b/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
index 6392286..9d6b135 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
@@ -355,18 +355,12 @@ public abstract class AbstractCluster implements Cluster {
    */
   public static List<Object> formatVectorAsJson(Vector v, String[] bindings) throws IOException {
 
-    List<TermIndexWeight> vectorTerms = Lists.newArrayList();
-
     boolean hasBindings = bindings != null;
     boolean isSparse = !v.isDense() && v.getNumNondefaultElements() != v.size();
 
     // we assume sequential access in the output
     Vector provider = v.isSequentialAccess() ? v : new SequentialAccessSparseVector(v);
 
-    for (Vector.Element elt : v.nonZeroes()) {
-      vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
-    }
-
     List<Object> terms = Lists.newLinkedList();
     String term = "";
 
@@ -391,20 +385,6 @@ public abstract class AbstractCluster implements Cluster {
     return terms;
   }
 
-  /**
-   * Convenience class for sorting terms
-   *
-   */
-  private static class TermIndexWeight {
-    private final int index;
-    private final double weight;
-
-    TermIndexWeight(int index, double weight) {
-      this.index = index;
-      this.weight = weight;
-    }
-  }
-
   @Override
   public boolean isConverged() {
     // Convergence has no meaning yet, perhaps in subclasses

http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java b/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
index d216318..07d6927 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
@@ -27,10 +27,7 @@ import java.util.Map;
  * 
  */
 public interface Cluster extends Model<VectorWritable>, Parametered {
-  
-  // default directory for all clustered points
-  String CLUSTERED_POINTS_DIR = "clusteredPoints";
-  
+
   // default directory for initial clusters to prime iterative clustering
   // algorithms
   String INITIAL_CLUSTERS_DIR = "clusters-0";

http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java b/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
index 4ca1249..1866747 100644
--- a/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
+++ b/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
@@ -17,65 +17,18 @@
 
 package org.apache.mahout.clustering;
 
-import org.apache.mahout.clustering.canopy.Canopy;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
 import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.SequentialAccessSparseVector;
 import org.apache.mahout.math.Vector;
-
-import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.Map;
-
 public final class TestClusterInterface extends MahoutTestCase {
 
   private static final DistanceMeasure measure = new ManhattanDistanceMeasure();
 
-  private final ObjectMapper jxn = new ObjectMapper();
-
-  @Test
-  public void testCanopyAsFormatString() throws IOException {
-    double[] d = { 1.1, 2.2, 3.3 };
-    Vector m = new DenseVector(d);
-    Cluster cluster = new Canopy(m, 123, measure);
-    String formatString = cluster.asFormatString(null);
-    assertEquals("{\"r\":[],\"c\":[1.1,2.2,3.3],\"n\":0,\"identifier\":\"C-123\"}", formatString);
-  }
-
-  @Test
-  public void testCanopyAsFormatStringSparse() {
-    double[] d = { 1.1, 0.0, 3.3 };
-    Vector m = new SequentialAccessSparseVector(3);
-    m.assign(d);
-    Cluster cluster = new Canopy(m, 123, measure);
-    String formatString = cluster.asFormatString(null);
-    assertEquals("{\"r\":[],\"c\":[{\"0\":1.1},{\"2\":3.3}],\"n\":0,\"identifier\":\"C-123\"}", formatString);
-  }
-
-  @Test
-  public void testCanopyAsFormatStringWithBindings() {
-    double[] d = { 1.1, 2.2, 3.3 };
-    Vector m = new DenseVector(d);
-    Cluster cluster = new Canopy(m, 123, measure);
-    String[] bindings = { "fee", null, null };
-    String formatString = cluster.asFormatString(bindings);
-    assertEquals("{\"r\":[],\"c\":[{\"fee\":1.1},{\"1\":2.2},{\"2\":3.3}],\"n\":0,\"identifier\":\"C-123\"}", formatString);
-  }
-
-  @Test
-  public void testCanopyAsFormatStringSparseWithBindings() {
-    double[] d = { 1.1, 0.0, 3.3 };
-    Vector m = new SequentialAccessSparseVector(3);
-    m.assign(d);
-    Cluster cluster = new Canopy(m, 123, measure);
-    String formatString = cluster.asFormatString(null);
-    assertEquals("{\"r\":[],\"c\":[{\"0\":1.1},{\"2\":3.3}],\"n\":0,\"identifier\":\"C-123\"}", formatString);
-  }
-
   @Test
   public void testClusterAsFormatString() {
     double[] d = { 1.1, 2.2, 3.3 };

http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java b/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
index 3ddc95d..fdcfd64 100644
--- a/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
+++ b/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.ClusteringTestUtils;
-import org.apache.mahout.clustering.canopy.Canopy;
 import org.apache.mahout.clustering.classify.ClusterClassifier;
 import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
 import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
@@ -97,20 +96,6 @@ public final class TestClusterClassifier extends MahoutTestCase {
   }
   
   @Test
-  public void testCanopyClassification() {
-    List<Cluster> models = Lists.newArrayList();
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    models.add(new Canopy(new DenseVector(2).assign(1), 0, measure));
-    models.add(new Canopy(new DenseVector(2), 1, measure));
-    models.add(new Canopy(new DenseVector(2).assign(-1), 2, measure));
-    ClusterClassifier classifier = new ClusterClassifier(models, new CanopyClusteringPolicy());
-    Vector pdf = classifier.classify(new DenseVector(2));
-    assertEquals("[0,0]", "[0.2,0.6,0.2]", AbstractCluster.formatVector(pdf, null));
-    pdf = classifier.classify(new DenseVector(2).assign(2));
-    assertEquals("[2,2]", "[0.493,0.296,0.211]", AbstractCluster.formatVector(pdf, null));
-  }
-  
-  @Test
   public void testClusterClassification() {
     ClusterClassifier classifier = newKlusterClassifier();
     Vector pdf = classifier.classify(new DenseVector(2));