You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ak...@apache.org on 2014/06/01 02:07:22 UTC
git commit: Removing references to deprecated canopy clustering tests
and examples.
Repository: mahout
Updated Branches:
refs/heads/master 1d1134ee9 -> 88bddb08a
Removing references to deprecated canopy clustering tests and examples.
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/88bddb08
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/88bddb08
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/88bddb08
Branch: refs/heads/master
Commit: 88bddb08ab014df9d1d14b75fc96f6a8380b4545
Parents: 1d1134e
Author: Andrew Musselman <ak...@apache.org>
Authored: Sat May 31 15:24:11 2014 -0700
Committer: Andrew Musselman <ak...@apache.org>
Committed: Sat May 31 15:24:11 2014 -0700
----------------------------------------------------------------------
examples/bin/cluster-syntheticcontrol.sh | 1 -
.../mahout/clustering/TestClusterDumper.java | 66 +++++++++-----------
.../mahout/clustering/TestClusterEvaluator.java | 2 +-
.../mahout/clustering/AbstractCluster.java | 20 ------
.../org/apache/mahout/clustering/Cluster.java | 5 +-
.../mahout/clustering/TestClusterInterface.java | 47 --------------
.../iterator/TestClusterClassifier.java | 15 -----
7 files changed, 33 insertions(+), 123 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/examples/bin/cluster-syntheticcontrol.sh b/examples/bin/cluster-syntheticcontrol.sh
index 550964e..188d166 100755
--- a/examples/bin/cluster-syntheticcontrol.sh
+++ b/examples/bin/cluster-syntheticcontrol.sh
@@ -34,7 +34,6 @@ else
echo "Please select a number to choose the corresponding clustering algorithm"
echo "1. ${algorithm[0]} clustering"
echo "2. ${algorithm[1]} clustering"
- echo "3. ${algorithm[2]} clustering"
read -p "Enter your choice : " choice
fi
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
index f273f3c..4bbab65 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
@@ -33,9 +33,9 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
@@ -177,49 +177,42 @@ public final class TestClusterDumper extends MahoutTestCase {
}
@Test
- public void testCanopy() throws Exception { // now run the Job
- DistanceMeasure measure = new EuclideanDistanceMeasure();
-
- Path output = getTestTempDirPath("output");
- CanopyDriver.run(getConfiguration(), getTestTempDirPath("testdata"),
- output, measure, 8, 4, true, 0.0, true);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
- "clusters-0-final"), new Path(output, "clusteredPoints"));
- clusterDumper.printClusters(termDictionary);
- }
-
- @Test
public void testKmeans() throws Exception {
DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
+ Path input = getTestTempFilePath("input");
Path output = getTestTempDirPath("output");
+ Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
Configuration conf = getConfiguration();
- CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
- 4, false, 0.0, true);
- // now run the KMeans job
+ FileSystem fs = FileSystem.get(conf);
+ // Write test data to file
+ ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+ // Select initial centroids
+ RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+ // Run k-means
Path kMeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
- "clusters-0-final"), kMeansOutput, 0.001, 10, true, 0.0, false);
- // run ClusterDumper
+ KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, true, 0.0, false);
+ // Print out clusters
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
- output, 10), new Path(kMeansOutput, "clusteredPoints"));
+ output, 10), new Path(kMeansOutput, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@Test
public void testJsonClusterDumper() throws Exception {
DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
+ Path input = getTestTempFilePath("input");
Path output = getTestTempDirPath("output");
+ Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
Configuration conf = getConfiguration();
- CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
- 4, false, 0.0, true);
- // now run the KMeans job
+ FileSystem fs = FileSystem.get(conf);
+ // Write test data to file
+ ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+ // Select initial centroids
+ RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+ // Run k-means
Path kmeansOutput = new Path(output, "kmeans");
- KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
- "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, false);
- // run ClusterDumper
+ KMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kmeansOutput, 0.001, 10, true, 0.0, false);
+ // Print out clusters
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(kmeansOutput, "clusteredPoints"));
clusterDumper.setOutputFormat(ClusterDumper.OUTPUT_FORMAT.JSON);
@@ -229,15 +222,18 @@ public final class TestClusterDumper extends MahoutTestCase {
@Test
public void testFuzzyKmeans() throws Exception {
DistanceMeasure measure = new EuclideanDistanceMeasure();
- // now run the Canopy job to prime kMeans canopies
+ Path input = getTestTempFilePath("input");
Path output = getTestTempDirPath("output");
+ Path initialPoints = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
Configuration conf = getConfiguration();
- CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
- 4, false, 0.0, true);
- // now run the Fuzzy KMeans job
+ FileSystem fs = FileSystem.get(conf);
+ // Write test data to file
+ ClusteringTestUtils.writePointsToFile(sampleData, input, fs, conf);
+ // Select initial centroids
+ RandomSeedGenerator.buildRandom(conf, input, initialPoints, 8, measure, 1L);
+ // Run fuzzy k-means
Path kMeansOutput = new Path(output, "kmeans");
- FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(
- output, "clusters-0-final"), kMeansOutput, 0.001, 10, 1.1f, true,
+ FuzzyKMeansDriver.run(conf, getTestTempDirPath("testdata"), initialPoints, kMeansOutput, 0.001, 10, 1.1f, true,
true, 0, true);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
----------------------------------------------------------------------
diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
index d0a54cf..8a226a0 100644
--- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
+++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
@@ -138,7 +138,7 @@ public final class TestClusterEvaluator extends MahoutTestCase {
points.add(new VectorWritable(cluster.getCenter().plus(new DenseVector(new double[] {-dP, dP}))));
}
}
-
+
@Test
public void testRepresentativePoints() throws Exception {
ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, "file1"), fs, conf);
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java b/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
index 6392286..9d6b135 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
@@ -355,18 +355,12 @@ public abstract class AbstractCluster implements Cluster {
*/
public static List<Object> formatVectorAsJson(Vector v, String[] bindings) throws IOException {
- List<TermIndexWeight> vectorTerms = Lists.newArrayList();
-
boolean hasBindings = bindings != null;
boolean isSparse = !v.isDense() && v.getNumNondefaultElements() != v.size();
// we assume sequential access in the output
Vector provider = v.isSequentialAccess() ? v : new SequentialAccessSparseVector(v);
- for (Vector.Element elt : v.nonZeroes()) {
- vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
- }
-
List<Object> terms = Lists.newLinkedList();
String term = "";
@@ -391,20 +385,6 @@ public abstract class AbstractCluster implements Cluster {
return terms;
}
- /**
- * Convenience class for sorting terms
- *
- */
- private static class TermIndexWeight {
- private final int index;
- private final double weight;
-
- TermIndexWeight(int index, double weight) {
- this.index = index;
- this.weight = weight;
- }
- }
-
@Override
public boolean isConverged() {
// Convergence has no meaning yet, perhaps in subclasses
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java b/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
index d216318..07d6927 100644
--- a/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
+++ b/mrlegacy/src/main/java/org/apache/mahout/clustering/Cluster.java
@@ -27,10 +27,7 @@ import java.util.Map;
*
*/
public interface Cluster extends Model<VectorWritable>, Parametered {
-
- // default directory for all clustered points
- String CLUSTERED_POINTS_DIR = "clusteredPoints";
-
+
// default directory for initial clusters to prime iterative clustering
// algorithms
String INITIAL_CLUSTERS_DIR = "clusters-0";
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java b/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
index 4ca1249..1866747 100644
--- a/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
+++ b/mrlegacy/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
@@ -17,65 +17,18 @@
package org.apache.mahout.clustering;
-import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
-
-import org.codehaus.jackson.map.ObjectMapper;
import org.junit.Test;
-import java.io.IOException;
-import java.util.Map;
-
public final class TestClusterInterface extends MahoutTestCase {
private static final DistanceMeasure measure = new ManhattanDistanceMeasure();
- private final ObjectMapper jxn = new ObjectMapper();
-
- @Test
- public void testCanopyAsFormatString() throws IOException {
- double[] d = { 1.1, 2.2, 3.3 };
- Vector m = new DenseVector(d);
- Cluster cluster = new Canopy(m, 123, measure);
- String formatString = cluster.asFormatString(null);
- assertEquals("{\"r\":[],\"c\":[1.1,2.2,3.3],\"n\":0,\"identifier\":\"C-123\"}", formatString);
- }
-
- @Test
- public void testCanopyAsFormatStringSparse() {
- double[] d = { 1.1, 0.0, 3.3 };
- Vector m = new SequentialAccessSparseVector(3);
- m.assign(d);
- Cluster cluster = new Canopy(m, 123, measure);
- String formatString = cluster.asFormatString(null);
- assertEquals("{\"r\":[],\"c\":[{\"0\":1.1},{\"2\":3.3}],\"n\":0,\"identifier\":\"C-123\"}", formatString);
- }
-
- @Test
- public void testCanopyAsFormatStringWithBindings() {
- double[] d = { 1.1, 2.2, 3.3 };
- Vector m = new DenseVector(d);
- Cluster cluster = new Canopy(m, 123, measure);
- String[] bindings = { "fee", null, null };
- String formatString = cluster.asFormatString(bindings);
- assertEquals("{\"r\":[],\"c\":[{\"fee\":1.1},{\"1\":2.2},{\"2\":3.3}],\"n\":0,\"identifier\":\"C-123\"}", formatString);
- }
-
- @Test
- public void testCanopyAsFormatStringSparseWithBindings() {
- double[] d = { 1.1, 0.0, 3.3 };
- Vector m = new SequentialAccessSparseVector(3);
- m.assign(d);
- Cluster cluster = new Canopy(m, 123, measure);
- String formatString = cluster.asFormatString(null);
- assertEquals("{\"r\":[],\"c\":[{\"0\":1.1},{\"2\":3.3}],\"n\":0,\"identifier\":\"C-123\"}", formatString);
- }
-
@Test
public void testClusterAsFormatString() {
double[] d = { 1.1, 2.2, 3.3 };
http://git-wip-us.apache.org/repos/asf/mahout/blob/88bddb08/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
----------------------------------------------------------------------
diff --git a/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java b/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
index 3ddc95d..fdcfd64 100644
--- a/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
+++ b/mrlegacy/src/test/java/org/apache/mahout/clustering/iterator/TestClusterClassifier.java
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
-import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.classify.ClusterClassifier;
import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
@@ -97,20 +96,6 @@ public final class TestClusterClassifier extends MahoutTestCase {
}
@Test
- public void testCanopyClassification() {
- List<Cluster> models = Lists.newArrayList();
- DistanceMeasure measure = new ManhattanDistanceMeasure();
- models.add(new Canopy(new DenseVector(2).assign(1), 0, measure));
- models.add(new Canopy(new DenseVector(2), 1, measure));
- models.add(new Canopy(new DenseVector(2).assign(-1), 2, measure));
- ClusterClassifier classifier = new ClusterClassifier(models, new CanopyClusteringPolicy());
- Vector pdf = classifier.classify(new DenseVector(2));
- assertEquals("[0,0]", "[0.2,0.6,0.2]", AbstractCluster.formatVector(pdf, null));
- pdf = classifier.classify(new DenseVector(2).assign(2));
- assertEquals("[2,2]", "[0.493,0.296,0.211]", AbstractCluster.formatVector(pdf, null));
- }
-
- @Test
public void testClusterClassification() {
ClusterClassifier classifier = newKlusterClassifier();
Vector pdf = classifier.classify(new DenseVector(2));