You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/08/18 23:47:32 UTC
svn commit: r986960 [2/2] - in /mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clust...
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Wed Aug 18 21:47:30 2010
@@ -39,6 +39,7 @@ import org.apache.mahout.clustering.Abst
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.kmeans.KMeansConfigKeys;
import org.apache.mahout.clustering.kmeans.OutputLogFilter;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
@@ -69,8 +70,8 @@ public class MeanShiftCanopyDriver exten
* the input pathname String
* @param output
* the output pathname String
- * @param measureClassName
- * the DistanceMeasure class name
+ * @param measure
+ * the DistanceMeasure
* @param t1
* the T1 distance threshold
* @param t2
@@ -87,7 +88,7 @@ public class MeanShiftCanopyDriver exten
*/
public static void runJob(Path input,
Path output,
- String measureClassName,
+ DistanceMeasure measure,
double t1,
double t2,
double convergenceDelta,
@@ -98,7 +99,7 @@ public class MeanShiftCanopyDriver exten
throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
new MeanShiftCanopyDriver().job(input,
output,
- measureClassName,
+ measure,
t1,
t2,
convergenceDelta,
@@ -142,10 +143,12 @@ public class MeanShiftCanopyDriver exten
boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION);
boolean runSequential = (getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
DefaultOptionCreator.SEQUENTIAL_METHOD));
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance();
job(input,
output,
- measureClass,
+ measure,
t1,
t2,
convergenceDelta,
@@ -215,8 +218,8 @@ public class MeanShiftCanopyDriver exten
* the input pathname String
* @param output
* the output pathname String
- * @param measureClassName
- * the DistanceMeasure class name
+ * @param measure
+ * the DistanceMeasure
* @param t1
* the T1 distance threshold
* @param t2
@@ -233,7 +236,7 @@ public class MeanShiftCanopyDriver exten
*/
public void job(Path input,
Path output,
- String measureClassName,
+ DistanceMeasure measure,
double t1,
double t2,
double convergenceDelta,
@@ -246,11 +249,8 @@ public class MeanShiftCanopyDriver exten
if (inputIsCanopies) {
clustersIn = input;
} else {
- createCanopyFromVectors(input, clustersIn, runSequential);
+ createCanopyFromVectors(input, clustersIn, measure, runSequential);
}
- ClassLoader ccl = Thread.currentThread().getContextClassLoader();
- Class<?> cl = ccl.loadClass(measureClassName);
- DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
Path clustersOut =
buildClusters(clustersIn, output, measure, t1, t2, convergenceDelta, maxIterations, runSequential);
@@ -263,20 +263,21 @@ public class MeanShiftCanopyDriver exten
}
}
- public void createCanopyFromVectors(Path input, Path output, boolean runSequential)
+ public void createCanopyFromVectors(Path input, Path output, DistanceMeasure measure, boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
if (runSequential) {
- createCanopyFromVectorsSeq(input, output);
+ createCanopyFromVectorsSeq(input, output, measure);
} else {
- createCanopyFromVectorsMR(input, output);
+ createCanopyFromVectorsMR(input, output, measure);
}
}
/**
* @param input the Path to the input VectorWritable data
* @param output the Path to the initial clusters directory
+ * @param measure the DistanceMeasure
*/
- private void createCanopyFromVectorsSeq(Path input, Path output)
+ private void createCanopyFromVectorsSeq(Path input, Path output, DistanceMeasure measure)
throws IOException, InstantiationException, IllegalAccessException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
@@ -294,7 +295,7 @@ public class MeanShiftCanopyDriver exten
Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
VectorWritable vw = (VectorWritable) reader.getValueClass().newInstance();
while (reader.next(key, vw)) {
- writer.append(new Text(), new MeanShiftCanopy(vw.get(), id++));
+ writer.append(new Text(), new MeanShiftCanopy(vw.get(), id++, measure));
vw = (VectorWritable) reader.getValueClass().newInstance();
}
} finally {
@@ -304,9 +305,10 @@ public class MeanShiftCanopyDriver exten
}
}
- private void createCanopyFromVectorsMR(Path input, Path output)
+ private void createCanopyFromVectorsMR(Path input, Path output, DistanceMeasure measure)
throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
+ conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
Job job = new Job(conf);
job.setJarByClass(MeanShiftCanopyDriver.class);
job.setOutputKeyClass(Text.class);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java Wed Aug 18 21:47:30 2010
@@ -29,6 +29,8 @@ import org.apache.mahout.clustering.diri
import org.apache.mahout.clustering.dirichlet.models.SampledNormalModel;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
@@ -42,6 +44,7 @@ public class TestClusterInterface extend
private static final Type MODEL_TYPE = new TypeToken<Model<Vector>>() {}.getType();
private static final Type CLUSTER_TYPE = new TypeToken<DirichletCluster<Vector>>() {}.getType();
+ private static final DistanceMeasure measure = new ManhattanDistanceMeasure();
public void testDirichletNormalModel() {
double[] d = { 1.1, 2.2, 3.3 };
@@ -173,7 +176,7 @@ public class TestClusterInterface extend
public void testCanopyAsFormatString() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
- Cluster cluster = new Canopy(m, 123);
+ Cluster cluster = new Canopy(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "C-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
@@ -183,7 +186,7 @@ public class TestClusterInterface extend
double[] d = { 1.1, 0.0, 3.3 };
Vector m = new SequentialAccessSparseVector(3);
m.assign(d);
- Cluster cluster = new Canopy(m, 123);
+ Cluster cluster = new Canopy(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "C-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
@@ -192,7 +195,7 @@ public class TestClusterInterface extend
public void testCanopyAsFormatStringWithBindings() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
- Cluster cluster = new Canopy(m, 123);
+ Cluster cluster = new Canopy(m, 123, measure);
String[] bindings = { "fee", null, null };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
@@ -203,7 +206,7 @@ public class TestClusterInterface extend
double[] d = { 1.1, 0.0, 3.3 };
Vector m = new SequentialAccessSparseVector(3);
m.assign(d);
- Cluster cluster = new Canopy(m, 123);
+ Cluster cluster = new Canopy(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "C-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
@@ -212,7 +215,7 @@ public class TestClusterInterface extend
public void testClusterAsFormatString() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
- Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
+ Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "CL-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
@@ -222,7 +225,7 @@ public class TestClusterInterface extend
double[] d = { 1.1, 0.0, 3.3 };
Vector m = new SequentialAccessSparseVector(3);
m.assign(d);
- Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
+ Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "CL-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
@@ -231,7 +234,7 @@ public class TestClusterInterface extend
public void testClusterAsFormatStringWithBindings() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
- Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
+ Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123, measure);
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
@@ -242,7 +245,7 @@ public class TestClusterInterface extend
double[] d = { 1.1, 0.0, 3.3 };
Vector m = new SequentialAccessSparseVector(3);
m.assign(d);
- Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
+ Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "CL-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
@@ -251,7 +254,7 @@ public class TestClusterInterface extend
public void testMSCanopyAsFormatString() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
- Cluster cluster = new MeanShiftCanopy(m, 123);
+ Cluster cluster = new MeanShiftCanopy(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "MSC-123{n=0 c=[1.100, 2.200, 3.300] r=[]}", formatString);
@@ -261,7 +264,7 @@ public class TestClusterInterface extend
double[] d = { 1.1, 0.0, 3.3 };
Vector m = new SequentialAccessSparseVector(3);
m.assign(d);
- Cluster cluster = new MeanShiftCanopy(m, 123);
+ Cluster cluster = new MeanShiftCanopy(m, 123, measure);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
assertEquals("format", "MSC-123{n=0 c=[0:1.100, 2:3.300] r=[]}", formatString);
@@ -270,7 +273,7 @@ public class TestClusterInterface extend
public void testMSCanopyAsFormatStringWithBindings() {
double[] d = { 1.1, 2.2, 3.3 };
Vector m = new DenseVector(d);
- Cluster cluster = new MeanShiftCanopy(m, 123);
+ Cluster cluster = new MeanShiftCanopy(m, 123, measure);
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
@@ -281,7 +284,7 @@ public class TestClusterInterface extend
double[] d = { 1.1, 0.0, 3.3 };
Vector m = new SequentialAccessSparseVector(3);
m.assign(d);
- Cluster cluster = new MeanShiftCanopy(m, 123);
+ Cluster cluster = new MeanShiftCanopy(m, 123, measure);
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Wed Aug 18 21:47:30 2010
@@ -288,7 +288,7 @@ public class TestCanopyCreation extends
ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, config);
// now run the Canopy Driver
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, ManhattanDistanceMeasure.class.getName(), 3.1, 2.1, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, false, false);
// verify output from sequence file
Path path = new Path(output, "clusters-0/part-r-00000");
@@ -319,7 +319,7 @@ public class TestCanopyCreation extends
ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, job);
// now run the Canopy Driver
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, EuclideanDistanceMeasure.class.getName(), 3.1, 2.1, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, euclideanDistanceMeasure, 3.1, 2.1, false, false);
// verify output from sequence file
Path path = new Path(output, "clusters-0/part-r-00000");
@@ -354,7 +354,7 @@ public class TestCanopyCreation extends
List<Canopy> canopies = new ArrayList<Canopy>();
int nextCanopyId = 0;
for (Vector centroid : manhattanCentroids) {
- canopies.add(new Canopy(centroid, nextCanopyId++));
+ canopies.add(new Canopy(centroid, nextCanopyId++, manhattanDistanceMeasure));
}
mapper.config(canopies);
List<VectorWritable> points = getPointsWritable();
@@ -389,7 +389,7 @@ public class TestCanopyCreation extends
List<Canopy> canopies = new ArrayList<Canopy>();
int nextCanopyId = 0;
for (Vector centroid : euclideanCentroids) {
- canopies.add(new Canopy(centroid, nextCanopyId++));
+ canopies.add(new Canopy(centroid, nextCanopyId++, euclideanDistanceMeasure));
}
mapper.config(canopies);
List<VectorWritable> points = getPointsWritable();
@@ -416,7 +416,7 @@ public class TestCanopyCreation extends
ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
// now run the Canopy Driver in sequential mode
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, ManhattanDistanceMeasure.class.getName(), 3.1, 2.1, true, true);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, true);
// verify output from sequence file
Path path = new Path(output, "clusters-0/part-r-00000");
@@ -494,7 +494,7 @@ public class TestCanopyCreation extends
ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file2"), fs, conf);
// now run the Job
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, ManhattanDistanceMeasure.class.getName(), 3.1, 2.1, true, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, false);
Path path = new Path(output, "clusteredPoints/part-m-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
int count = 0;
@@ -547,7 +547,7 @@ public class TestCanopyCreation extends
// now run the Canopy Driver. User defined measure happens to be a Manhattan
// subclass so results are same.
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, UserDefinedDistanceMeasure.class.getName(), 3.1, 2.1, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, new UserDefinedDistanceMeasure(), 3.1, 2.1, false, false);
// verify output from sequence file
Configuration job = new Configuration();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/dirichlet/TestMapReduce.java Wed Aug 18 21:47:30 2010
@@ -441,7 +441,6 @@ public class TestMapReduce extends Mahou
in.reset(out.getData(), out.getLength());
model2.readFields(in);
assertEquals("models", model.toString(), model2.toString());
- assertEquals("ids", 5, model.getId());
}
public void testSampledNormalModelWritableSerialization() throws Exception {
@@ -454,7 +453,6 @@ public class TestMapReduce extends Mahou
in.reset(out.getData(), out.getLength());
model2.readFields(in);
assertEquals("models", model.toString(), model2.toString());
- assertEquals("ids", 5, model.getId());
}
public void testAsymmetricSampledNormalModelWritableSerialization() throws Exception {
@@ -468,7 +466,6 @@ public class TestMapReduce extends Mahou
in.reset(out.getData(), out.getLength());
model2.readFields(in);
assertEquals("models", model.toString(), model2.toString());
- assertEquals("ids", 5, model.getId());
}
public void testClusterWritableSerialization() throws Exception {
@@ -483,7 +480,6 @@ public class TestMapReduce extends Mahou
assertEquals("count", cluster.getTotalCount(), cluster2.getTotalCount());
assertNotNull("model null", cluster2.getModel());
assertEquals("model", cluster.getModel().toString(), cluster2.getModel().toString());
- assertEquals("ids", 5, cluster.getId());
}
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Wed Aug 18 21:47:30 2010
@@ -52,6 +52,8 @@ import org.apache.mahout.math.VectorWrit
public class TestFuzzyKmeansClustering extends MahoutTestCase {
private FileSystem fs;
+
+ private DistanceMeasure measure = new EuclideanDistanceMeasure();
private static void rmr(String path) {
File f = new File(path);
@@ -151,6 +153,7 @@ public class TestFuzzyKmeansClustering e
public void testReferenceImplementation() throws Exception {
List<Vector> points = TestKmeansClustering.getPoints(TestKmeansClustering.reference);
+ EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
for (int k = 0; k < points.size(); k++) {
System.out.println("test k= " + k);
@@ -158,7 +161,7 @@ public class TestFuzzyKmeansClustering e
// pick k initial cluster centers at random
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i));
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
// add the center so the centroid will be correct upon output
//cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
@@ -167,13 +170,13 @@ public class TestFuzzyKmeansClustering e
// run reference FuzzyKmeans algorithm
List<List<SoftCluster>> clusters = FuzzyKMeansClusterer.clusterPoints(points,
clusterList,
- new EuclideanDistanceMeasure(),
+ measure,
0.001,
2,
2);
computeCluster(points,
clusters.get(clusters.size() - 1),
- new FuzzyKMeansClusterer(new EuclideanDistanceMeasure(), 0.001, 2),
+ new FuzzyKMeansClusterer(measure, 0.001, 2),
pointClusterInfo);
// iterate for each cluster
@@ -204,7 +207,7 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
// add the center so the centroid will be correct upon output
cluster.observe(cluster.getCenter(), 1);
/*
@@ -267,7 +270,7 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
// add the center so the centroid will be correct upon output
cluster.observe(cluster.getCenter(), 1);
/*
@@ -321,7 +324,7 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
cluster.observe(cluster.getCenter(), 1);
clusterList.add(cluster);
}
@@ -383,7 +386,7 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
cluster.observe(cluster.getCenter(), 1);
clusterList.add(cluster);
}
@@ -439,7 +442,7 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
// cluster.addPoint(cluster.getCenter(), 1);
clusterList.add(cluster);
}
@@ -493,7 +496,7 @@ public class TestFuzzyKmeansClustering e
List<SoftCluster> reference = new ArrayList<SoftCluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- reference.add(new SoftCluster(vec, i));
+ reference.add(new SoftCluster(vec, i, measure));
}
List<Vector> pointsVectors = new ArrayList<Vector>();
for (VectorWritable point : points) {
@@ -526,7 +529,7 @@ public class TestFuzzyKmeansClustering e
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- SoftCluster cluster = new SoftCluster(vec, i);
+ SoftCluster cluster = new SoftCluster(vec, i, measure);
cluster.observe(cluster.getCenter(), 1);
clusterList.add(cluster);
}
@@ -601,7 +604,7 @@ public class TestFuzzyKmeansClustering e
List<SoftCluster> reference = new ArrayList<SoftCluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = tweakValue(points.get(i).get());
- reference.add(new SoftCluster(vec, i));
+ reference.add(new SoftCluster(vec, i, measure));
}
Map<Integer, List<WeightedVectorWritable>> refClusters = new HashMap<Integer, List<WeightedVectorWritable>>();
List<Vector> pointsVectors = new ArrayList<Vector>();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Wed Aug 18 21:47:30 2010
@@ -113,7 +113,7 @@ public class TestKmeansClustering extend
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i);
- clusters.add(new Cluster(vec, i));
+ clusters.add(new Cluster(vec, i, measure));
}
// iterate clusters until they converge
int maxIter = 10;
@@ -153,7 +153,7 @@ public class TestKmeansClustering extend
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
- Cluster cluster = new Cluster(points.get(i).get(), i);
+ Cluster cluster = new Cluster(points.get(i).get(), i, measure);
// add the center so the centroid will be correct upon output
cluster.observe(cluster.getCenter(), 1);
clusters.add(cluster);
@@ -200,7 +200,7 @@ public class TestKmeansClustering extend
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
- Cluster cluster = new Cluster(vec, i);
+ Cluster cluster = new Cluster(vec, i, measure);
// add the center so the centroid will be correct upon output
cluster.observe(cluster.getCenter(), 1);
clusters.add(cluster);
@@ -258,7 +258,7 @@ public class TestKmeansClustering extend
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
- Cluster cluster = new Cluster(vec, i);
+ Cluster cluster = new Cluster(vec, i, measure);
// add the center so the centroid will be correct upon output
// cluster.addPoint(cluster.getCenter());
clusters.add(cluster);
@@ -296,7 +296,7 @@ public class TestKmeansClustering extend
List<Cluster> reference = new ArrayList<Cluster>();
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
- reference.add(new Cluster(vec, i));
+ reference.add(new Cluster(vec, i, measure));
}
List<Vector> pointsVectors = new ArrayList<Vector>();
for (VectorWritable point : points) {
@@ -331,6 +331,7 @@ public class TestKmeansClustering extend
/** Story: User wishes to run kmeans job on reference data */
public void testKMeansSeqJob() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
List<VectorWritable> points = getPointsWritable(reference);
Path pointsPath = getTestTempDirPath("points");
@@ -348,7 +349,7 @@ public class TestKmeansClustering extend
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
- Cluster cluster = new Cluster(vec, i);
+ Cluster cluster = new Cluster(vec, i, measure);
// add the center so the centroid will be correct upon output
cluster.observe(cluster.getCenter(), 1);
writer.append(new Text(cluster.getIdentifier()), cluster);
@@ -386,6 +387,7 @@ public class TestKmeansClustering extend
/** Story: User wishes to run kmeans job on reference data */
public void testKMeansMRJob() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
List<VectorWritable> points = getPointsWritable(reference);
Path pointsPath = getTestTempDirPath("points");
@@ -403,7 +405,7 @@ public class TestKmeansClustering extend
for (int i = 0; i < k + 1; i++) {
Vector vec = points.get(i).get();
- Cluster cluster = new Cluster(vec, i);
+ Cluster cluster = new Cluster(vec, i, measure);
// add the center so the centroid will be correct upon output
cluster.observe(cluster.getCenter(), 1);
writer.append(new Text(cluster.getIdentifier()), cluster);
@@ -455,13 +457,13 @@ public class TestKmeansClustering extend
Path outputPath = getTestTempDirPath("output");
// now run the Canopy job
- CanopyDriver.runJob(pointsPath, outputPath, ManhattanDistanceMeasure.class.getName(), 3.1, 2.1, false, false);
+ CanopyDriver.runJob(pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, false);
// now run the KMeans job
KMeansDriver.runJob(pointsPath,
new Path(outputPath, "clusters-0"),
outputPath,
- EuclideanDistanceMeasure.class.getName(),
+ new EuclideanDistanceMeasure(),
0.001,
10,
1,
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestRandomSeedGenerator.java Wed Aug 18 21:47:30 2010
@@ -32,6 +32,7 @@ import org.apache.hadoop.mapreduce.Job;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
@@ -84,7 +85,7 @@ public class TestRandomSeedGenerator ext
Path output = getTestTempDirPath("random-output");
ClusteringTestUtils.writePointsToFile(points, input, fs, conf);
- RandomSeedGenerator.buildRandom(input, output, 4);
+ RandomSeedGenerator.buildRandom(input, output, 4, new ManhattanDistanceMeasure());
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(output, "part-randomSeed"), conf);
Writable key = (Writable) reader.getKeyClass().newInstance();
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Wed Aug 18 21:47:30 2010
@@ -84,7 +84,7 @@ public class TestMeanShift extends Mahou
int nextCanopyId = 0;
List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
for (Vector point : raw) {
- canopies.add(new MeanShiftCanopy(point, nextCanopyId++));
+ canopies.add(new MeanShiftCanopy(point, nextCanopyId++, euclideanDistanceMeasure));
}
return canopies;
}
@@ -119,7 +119,7 @@ public class TestMeanShift extends Mahou
// add all points to the canopies
int nextCanopyId = 0;
for (Vector aRaw : raw) {
- clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++), canopies);
+ clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++, euclideanDistanceMeasure), canopies);
}
boolean done = false;
int iter = 1;
@@ -145,7 +145,7 @@ public class TestMeanShift extends Mahou
List<Vector> points = new ArrayList<Vector>();
for (Vector v : raw)
points.add(v);
- List<MeanShiftCanopy> canopies = MeanShiftCanopyClusterer.clusterPoints(points, new EuclideanDistanceMeasure(), 0.5, 4, 1, 10);
+ List<MeanShiftCanopy> canopies = MeanShiftCanopyClusterer.clusterPoints(points, euclideanDistanceMeasure, 0.5, 4, 1, 10);
printCanopies(canopies);
printImage(canopies);
}
@@ -162,7 +162,7 @@ public class TestMeanShift extends Mahou
List<MeanShiftCanopy> refCanopies = new ArrayList<MeanShiftCanopy>();
int nextCanopyId = 0;
for (Vector aRaw : raw) {
- clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++), refCanopies);
+ clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++, euclideanDistanceMeasure), refCanopies);
}
Configuration conf = new Configuration();
@@ -222,7 +222,7 @@ public class TestMeanShift extends Mahou
List<MeanShiftCanopy> mapperReference = new ArrayList<MeanShiftCanopy>();
int nextCanopyId = 0;
for (Vector aRaw : raw) {
- clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++), mapperReference);
+ clusterer.mergeCanopy(new MeanShiftCanopy(aRaw, nextCanopyId++, euclideanDistanceMeasure), mapperReference);
}
for (MeanShiftCanopy canopy : mapperReference) {
clusterer.shiftToMean(canopy);
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java Wed Aug 18 21:47:30 2010
@@ -50,7 +50,7 @@ class DisplayCanopy extends DisplayClust
writeSampleData(samples);
//boolean b = true;
//if (b) {
- new CanopyDriver().buildClusters(samples, output, ManhattanDistanceMeasure.class.getName(), T1, T2, true);
+ new CanopyDriver().buildClusters(samples, output, new ManhattanDistanceMeasure(), T1, T2, true);
loadClusters(output);
//} else {
// List<Vector> points = new ArrayList<Vector>();
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayDirichlet.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayDirichlet.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayDirichlet.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayDirichlet.java Wed Aug 18 21:47:30 2010
@@ -84,8 +84,8 @@ public class DisplayDirichlet extends Di
for (Model<VectorWritable>[] models : result) {
List<Cluster> clusters = new ArrayList<Cluster>();
for (Model<VectorWritable> cluster : models) {
- if (isSignificant(cluster)) {
- clusters.add(cluster);
+ if (isSignificant((Cluster)cluster)) {
+ clusters.add((Cluster)cluster);
}
}
CLUSTERS.add(clusters);
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java Wed Aug 18 21:47:30 2010
@@ -57,11 +57,11 @@ class DisplayFuzzyKMeans extends Display
int m = 3;
//if (b) {
writeSampleData(samples);
- Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3);
+ Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3, measure);
FuzzyKMeansDriver.runJob(samples,
clusters,
output,
- measure.getClass().getName(),
+ measure,
threshold,
numIterations,
1,
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java Wed Aug 18 21:47:30 2010
@@ -51,9 +51,9 @@ class DisplayKMeans extends DisplayClust
int maxIter = 10;
double distanceThreshold = 0.001;
//if (b) {
- Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3);
- KMeansDriver.runJob(samples, clusters, output, measure.getClass().getName(), distanceThreshold, maxIter, 1, true, true);
- loadClusters(output);
+ Path clusters = RandomSeedGenerator.buildRandom(samples, new Path(output, "clusters-0"), 3, measure);
+ KMeansDriver.runJob(samples, clusters, output, measure, distanceThreshold, maxIter, 1, true, true);
+ loadClusters(output);
//} else {
// List<Vector> points = new ArrayList<Vector>();
// for (VectorWritable sample : SAMPLE_DATA) {
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java Wed Aug 18 21:47:30 2010
@@ -112,7 +112,7 @@ final class DisplayMeanShift extends Dis
writeSampleData(samples);
boolean b = true;
if (b) {
- MeanShiftCanopyDriver.runJob(samples, output, measure.getClass().getName(), t1, t2, 0.005, 20, false, true, true);
+ MeanShiftCanopyDriver.runJob(samples, output, measure, t1, t2, 0.005, 20, false, true, true);
loadClusters(output);
} else {
List<Vector> points = new ArrayList<Vector>();
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Wed Aug 18 21:47:30 2010
@@ -25,6 +25,8 @@ import org.apache.mahout.clustering.cano
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -44,7 +46,7 @@ public final class Job extends CanopyDri
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- job(new Path("testdata"), output, "org.apache.mahout.common.distance.EuclideanDistanceMeasure", 80, 55);
+ job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
}
}
@@ -60,22 +62,20 @@ public final class Job extends CanopyDri
* the String denoting the input directory path
* @param output
* the String denoting the output directory path
- * @param measureClassName
- * the String class name of the DistanceMeasure to use
+ * @param measure
+ * the DistanceMeasure to use
* @param t1
* the canopy T1 threshold
* @param t2
* the canopy T2 threshold
*/
- private static void job(Path input, Path output, String measureClassName, double t1, double t2)
- throws IOException,
+ private static void job(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws IOException,
InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2, true, false);
+ CanopyDriver.runJob(directoryContainingConvertedInput, output, measure, t1, t2, true, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0"),
- new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(null);
}
@@ -102,8 +102,10 @@ public final class Job extends CanopyDri
String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance();
- job(input, output, measureClass, t1, t2);
+ job(input, output, measure, t1, t2);
return 0;
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java Wed Aug 18 21:47:30 2010
@@ -31,6 +31,8 @@ import org.apache.mahout.clustering.synt
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
@@ -51,16 +53,7 @@ public final class Job extends FuzzyKMea
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- new Job().job(new Path("testdata"),
- output,
- "org.apache.mahout.common.distance.EuclideanDistanceMeasure",
- 80,
- 55,
- 10,
- 1,
- (float) 2,
- 0.5,
- true);
+ new Job().job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 1, (float) 2, 0.5, true);
}
}
@@ -109,14 +102,17 @@ public final class Job extends FuzzyKMea
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.overwriteOutput(output);
}
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance();
+
if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(argMap
- .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)));
+ .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
}
boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- job(input, output, measureClass, t1, t2, maxIterations, numReduceTasks, fuzziness, convergenceDelta, runClustering);
+ job(input, output, measure, t1, t2, maxIterations, numReduceTasks, fuzziness, convergenceDelta, runClustering);
return 0;
}
@@ -149,26 +145,26 @@ public final class Job extends FuzzyKMea
*/
private void job(Path input,
Path output,
- String measureClass,
+ DistanceMeasure measure,
double t1,
double t2,
int maxIterations,
int numReducerTasks,
float fuzziness,
double convergenceDelta,
- boolean runClustering)
- throws IOException, InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
+ boolean runClustering) throws IOException, InstantiationException, IllegalAccessException, InterruptedException,
+ ClassNotFoundException {
Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
log.info("Preparing Input");
InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
log.info("Running Canopy to get initial clusters");
- CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClass, t1, t2, false, false);
+ CanopyDriver.runJob(directoryContainingConvertedInput, output, measure, t1, t2, false, false);
log.info("Running FuzzyKMeans");
FuzzyKMeansDriver.runJob(directoryContainingConvertedInput,
new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
output,
- measureClass,
+ measure,
convergenceDelta,
maxIterations,
numReducerTasks,
@@ -178,8 +174,7 @@ public final class Job extends FuzzyKMea
0.0,
false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"),
- new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(null);
}
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Wed Aug 18 21:47:30 2010
@@ -29,6 +29,8 @@ import org.apache.mahout.clustering.synt
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
@@ -49,7 +51,7 @@ public final class Job extends KMeansDri
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- new Job().job(new Path("testdata"), output, "org.apache.mahout.common.distance.EuclideanDistanceMeasure", 80, 55, 0.5, 10);
+ new Job().job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
}
}
@@ -90,12 +92,15 @@ public final class Job extends KMeansDri
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.overwriteOutput(output);
}
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ Class<?> cl = ccl.loadClass(measureClass);
+ DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(argMap
- .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)));
+ .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
}
boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
- runJob(input, clusters, output, measureClass, convergenceDelta, maxIterations, numReduceTasks, runClustering, false);
+ runJob(input, clusters, output, measure, convergenceDelta, maxIterations, numReduceTasks, runClustering, false);
return 0;
}
@@ -111,8 +116,8 @@ public final class Job extends KMeansDri
* the String denoting the input directory path
* @param output
* the String denoting the output directory path
- * @param measureClass
- * the String class name of the DistanceMeasure to use
+ * @param measure
+ * the DistanceMeasure to use
* @param t1
* the canopy T1 threshold
* @param t2
@@ -126,24 +131,31 @@ public final class Job extends KMeansDri
* @throws ClassNotFoundException
* @throws InterruptedException
*/
- private void job(Path input, Path output, String measureClass, double t1, double t2, double convergenceDelta, int maxIterations)
- throws IOException, InstantiationException, IllegalAccessException, InterruptedException, ClassNotFoundException {
+ private void job(Path input,
+ Path output,
+ DistanceMeasure measure,
+ double t1,
+ double t2,
+ double convergenceDelta,
+ int maxIterations) throws IOException, InstantiationException, IllegalAccessException, InterruptedException,
+ ClassNotFoundException {
HadoopUtil.overwriteOutput(output);
Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
log.info("Preparing Input");
InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
log.info("Running Canopy to get initial clusters");
- CanopyDriver.runJob(directoryContainingConvertedInput, output, measureClass, t1, t2, false, false);
+ CanopyDriver.runJob(directoryContainingConvertedInput, output, measure, t1, t2, false, false);
log.info("Running KMeans");
KMeansDriver.runJob(directoryContainingConvertedInput,
new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
output,
- measureClass,
+ measure,
convergenceDelta,
maxIterations,
1,
- true, false);
+ true,
+ false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output,
"clusteredPoints"));
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java Wed Aug 18 21:47:30 2010
@@ -26,12 +26,14 @@ import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
-public class InputMapper extends Mapper<LongWritable,Text,Text,MeanShiftCanopy> {
-
+public class InputMapper extends Mapper<LongWritable, Text, Text, MeanShiftCanopy> {
+
private static final Pattern SPACE = Pattern.compile(" ");
+
private int nextCanopyId;
@Override
@@ -49,7 +51,7 @@ public class InputMapper extends Mapper<
for (Double d : doubles) {
point.set(index++, d);
}
- MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++);
+ MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++, new EuclideanDistanceMeasure());
context.write(new Text(), canopy);
}
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Wed Aug 18 21:47:30 2010
@@ -27,6 +27,8 @@ import org.apache.mahout.clustering.mean
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -46,7 +48,7 @@ public final class Job extends MeanShift
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- new Job().job(new Path("testdata"), output, "org.apache.mahout.common.distance.EuclideanDistanceMeasure", 47.6, 1, 0.5, 10);
+ new Job().job(new Path("testdata"), output, new EuclideanDistanceMeasure(), 47.6, 1, 0.5, 10);
}
}
@@ -82,8 +84,10 @@ public final class Job extends MeanShift
double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION);
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance();
- runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations, inputIsCanopies, runClustering, false);
+ runJob(input, output, measure, t1, t2, convergenceDelta, maxIterations, inputIsCanopies, runClustering, false);
return 0;
}
@@ -99,8 +103,8 @@ public final class Job extends MeanShift
* the String denoting the input directory path
* @param output
* the String denoting the output directory path
- * @param measureClassName
- * the String class name of the DistanceMeasure to use
+ * @param measure
+ * the DistanceMeasure to use
* @param t1
* the meanshift canopy T1 threshold
* @param t2
@@ -112,26 +116,27 @@ public final class Job extends MeanShift
*/
private void job(Path input,
Path output,
- String measureClassName,
+ DistanceMeasure measure,
double t1,
double t2,
double convergenceDelta,
- int maxIterations)
- throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
+ int maxIterations) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+ IllegalAccessException {
Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
InputDriver.runJob(input, directoryContainingConvertedInput);
MeanShiftCanopyDriver.runJob(directoryContainingConvertedInput,
output,
- measureClassName,
+ measure,
t1,
t2,
convergenceDelta,
maxIterations,
true,
- true, false);
+ true,
+ false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations),
- new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output,
+ "clusteredPoints"));
clusterDumper.printClusters(null);
}
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwDriver.java Wed Aug 18 21:47:30 2010
@@ -36,6 +36,7 @@ import org.apache.mahout.clustering.Weig
import org.apache.mahout.clustering.dirichlet.DirichletCluster;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -58,8 +59,8 @@ public final class CDbwDriver extends Ab
}
@Override
- public int run(String[] args)
- throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException, InterruptedException {
+ public int run(String[] args) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException,
+ InterruptedException {
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.distanceMeasureOption().create());
@@ -74,7 +75,10 @@ public final class CDbwDriver extends Ab
String distanceMeasureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
int numReducers = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- job(input, null, output, distanceMeasureClass, maxIterations, numReducers);
+ ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+ DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(distanceMeasureClass)).newInstance();
+
+ job(input, null, output, measure, maxIterations, numReducers);
return 0;
}
@@ -87,8 +91,8 @@ public final class CDbwDriver extends Ab
the directory pathname for input clustered points [clusterId :: VectorWritable]
* @param output
* the directory pathname for output reference points [clusterId :: VectorWritable]
- * @param distanceMeasureClass
- * the String ModelDistribution class name to use
+ * @param measure
+ * the DistanceMeasure to use
* @param numIterations
* the number of iterations
* @param numReducers
@@ -98,21 +102,20 @@ public final class CDbwDriver extends Ab
public static void runJob(Path clustersIn,
Path clusteredPointsIn,
Path output,
- String distanceMeasureClass,
+ DistanceMeasure measure,
int numIterations,
- int numReducers)
- throws ClassNotFoundException, InstantiationException, IllegalAccessException,
- IOException, InterruptedException {
- job(clustersIn, clusteredPointsIn, output, distanceMeasureClass, numIterations, numReducers);
+ int numReducers) throws ClassNotFoundException, InstantiationException, IllegalAccessException,
+ IOException, InterruptedException {
+ job(clustersIn, clusteredPointsIn, output, measure, numIterations, numReducers);
}
private static void job(Path clustersIn,
Path clusteredPointsIn,
Path output,
- String distanceMeasureClass,
+ DistanceMeasure measure,
int numIterations,
- int numReducers)
- throws InstantiationException, IllegalAccessException, IOException, InterruptedException, ClassNotFoundException {
+ int numReducers) throws InstantiationException, IllegalAccessException, IOException,
+ InterruptedException, ClassNotFoundException {
Path stateIn = new Path(output, "representativePoints-0");
writeInitialState(stateIn, clustersIn);
@@ -120,14 +123,14 @@ public final class CDbwDriver extends Ab
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path stateOut = new Path(output, "representativePoints-" + (iteration + 1));
- runIteration(clusteredPointsIn, stateIn, stateOut, distanceMeasureClass, numReducers);
+ runIteration(clusteredPointsIn, stateIn, stateOut, measure, numReducers);
// now point the input to the old output directory
stateIn = stateOut;
}
Configuration conf = new Configuration();
conf.set(STATE_IN_KEY, stateIn.toString());
- conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
+ conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);
// now print out the Results
System.out.println("CDbw = " + evaluator.CDbw());
@@ -169,20 +172,16 @@ public final class CDbwDriver extends Ab
* the directory pathname for input state
* @param stateOut
* the directory pathname for output state
- * @param distanceMeasureClass
- * the class name of the DistanceMeasure class
+ * @param measure
+ * the DistanceMeasure
* @param numReducers
* the number of Reducers desired
*/
- private static void runIteration(Path input,
- Path stateIn,
- Path stateOut,
- String distanceMeasureClass,
- int numReducers)
+ private static void runIteration(Path input, Path stateIn, Path stateOut, DistanceMeasure measure, int numReducers)
throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
conf.set(STATE_IN_KEY, stateIn.toString());
- conf.set(DISTANCE_MEASURE_KEY, distanceMeasureClass);
+ conf.set(DISTANCE_MEASURE_KEY, measure.getClass().getName());
Job job = new Job(conf);
job.setJarByClass(CDbwDriver.class);
job.setOutputKeyClass(IntWritable.class);
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Wed Aug 18 21:47:30 2010
@@ -42,6 +42,7 @@ import org.apache.mahout.clustering.kmea
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.distance.CosineDistanceMeasure;
+import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
@@ -136,48 +137,54 @@ public class TestClusterDumper extends M
}
public void testCanopy() throws Exception { // now run the Job
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, EuclideanDistanceMeasure.class.getName(), 8, 4, true, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 8, 4, true, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
public void testKmeans() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, EuclideanDistanceMeasure.class.getName(), 8, 4, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 8, 4, false, false);
// now run the KMeans job
- KMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, EuclideanDistanceMeasure.class
- .getName(), 0.001, 10, 1, true, false);
+ KMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
public void testFuzzyKmeans() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
Path output = getTestTempDirPath("output");
- CanopyDriver.runJob(getTestTempDirPath("testdata"), output, EuclideanDistanceMeasure.class.getName(), 8, 4, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 8, 4, false, false);
// now run the Fuzzy KMeans job
- FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, EuclideanDistanceMeasure.class
- .getName(), 0.001, 10, 1, (float) 1.1, true, true, 0, false);
+ FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"),
+ new Path(output, "clusters-0"),
+ output,
+ measure,
+ 0.001,
+ 10,
+ 1,
+ (float) 1.1,
+ true,
+ true,
+ 0,
+ false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-3"), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
public void testMeanShift() throws Exception {
+ DistanceMeasure measure = new CosineDistanceMeasure();
Path output = getTestTempDirPath("output");
- MeanShiftCanopyDriver.runJob(getTestTempDirPath("testdata"),
- output,
- CosineDistanceMeasure.class.getName(),
- 0.5,
- 0.01,
- 0.05,
- 10,
- false,
- true, false);
+ MeanShiftCanopyDriver.runJob(getTestTempDirPath("testdata"), output, measure, 0.5, 0.01, 0.05, 10, false, true, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-1"), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=986960&r1=986959&r2=986960&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Wed Aug 18 21:47:30 2010
@@ -41,6 +41,7 @@ import org.apache.mahout.clustering.kmea
import org.apache.mahout.clustering.kmeans.TestKmeansClustering;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
@@ -48,8 +49,8 @@ import org.apache.mahout.math.VectorWrit
public class TestCDbwEvaluator extends MahoutTestCase {
- private static final double[][] reference = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 },
- { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };
+ private static final double[][] reference = { { 1, 1 }, { 2, 1 }, { 1, 2 }, { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
+ { 5, 5 } };
private Map<Integer, List<VectorWritable>> representativePoints;
@@ -91,13 +92,14 @@ public class TestCDbwEvaluator extends M
* Initialize synthetic data using 4 clusters dC units from origin having 4 representative points dP from each center
* @param dC a double cluster center offset
* @param dP a double representative point offset
+ * @param measure the DistanceMeasure passed to each Canopy cluster created here
*/
- private void initData(double dC, double dP) {
+ private void initData(double dC, double dP, DistanceMeasure measure) {
clusters = new HashMap<Integer, Cluster>();
- clusters.put(1, new Canopy(new DenseVector(new double[] { -dC, -dC }), 1));
- clusters.put(3, new Canopy(new DenseVector(new double[] { -dC, dC }), 3));
- clusters.put(5, new Canopy(new DenseVector(new double[] { dC, dC }), 5));
- clusters.put(7, new Canopy(new DenseVector(new double[] { dC, -dC }), 7));
+ clusters.put(1, new Canopy(new DenseVector(new double[] { -dC, -dC }), 1, measure));
+ clusters.put(3, new Canopy(new DenseVector(new double[] { -dC, dC }), 3, measure));
+ clusters.put(5, new Canopy(new DenseVector(new double[] { dC, dC }), 5, measure));
+ clusters.put(7, new Canopy(new DenseVector(new double[] { dC, -dC }), 7, measure));
representativePoints = new HashMap<Integer, List<VectorWritable>>();
for (Cluster cluster : clusters.values()) {
List<VectorWritable> points = new ArrayList<VectorWritable>();
@@ -111,8 +113,9 @@ public class TestCDbwEvaluator extends M
}
public void testCDbw0() {
- initData(1, 0.25);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.25, measure);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity());
assertEquals("separation", 1.5, evaluator.separation());
assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity());
@@ -120,8 +123,9 @@ public class TestCDbwEvaluator extends M
}
public void testCDbw1() {
- initData(1, 0.5);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.5, measure);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity());
assertEquals("separation", 1.0, evaluator.separation());
assertEquals("intra cluster density", 0.44721359549995787, evaluator.intraClusterDensity());
@@ -129,8 +133,9 @@ public class TestCDbwEvaluator extends M
}
public void testCDbw2() {
- initData(1, 0.75);
- CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, new EuclideanDistanceMeasure());
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ initData(1, 0.75, measure);
+ CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 1.017921815355728, evaluator.interClusterDensity());
assertEquals("separation", 0.24777966925931558, evaluator.separation());
assertEquals("intra cluster density", 0.29814239699997197, evaluator.intraClusterDensity());
@@ -138,62 +143,89 @@ public class TestCDbwEvaluator extends M
}
public void testCanopy() throws Exception { // now run the Job
- CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"),
- EuclideanDistanceMeasure.class.getName(), 3.1, 2.1, true, false);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, true, false);
int numIterations = 2;
Path output = getTestTempDirPath("output");
- CDbwDriver.runJob(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"), output,
- EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob(new Path(output, "clusters-0"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
checkRefPoints(numIterations);
}
public void testKmeans() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"),
- EuclideanDistanceMeasure.class.getName(), 3.1, 2.1, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, false, false);
// now run the KMeans job
Path output = getTestTempDirPath("output");
- KMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output,
- EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, true, false);
+ KMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output, measure, 0.001, 10, 1, true, false);
int numIterations = 2;
- CDbwDriver.runJob(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"), output,
- EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
checkRefPoints(numIterations);
}
public void testFuzzyKmeans() throws Exception {
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"),
- EuclideanDistanceMeasure.class.getName(), 3.1, 2.1, false, false);
+ CanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"), measure, 3.1, 2.1, false, false);
// now run the KMeans job
Path output = getTestTempDirPath("output");
- FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"), new Path(output, "clusters-0"), output,
- EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 2, true, true, 0, false);
+ FuzzyKMeansDriver.runJob(getTestTempDirPath("testdata"),
+ new Path(output, "clusters-0"),
+ output,
+ measure,
+ 0.001,
+ 10,
+ 1,
+ 2,
+ true,
+ true,
+ 0,
+ false);
int numIterations = 2;
- CDbwDriver.runJob(new Path(output, "clusters-4"), new Path(output, "clusteredPoints"), output,
- EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob(new Path(output, "clusters-4"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
checkRefPoints(numIterations);
}
public void testMeanShift() throws Exception {
- MeanShiftCanopyDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"),
- EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10, false, true, false);
+ DistanceMeasure measure = new EuclideanDistanceMeasure();
+ MeanShiftCanopyDriver.runJob(getTestTempDirPath("testdata"),
+ getTestTempDirPath("output"),
+ measure,
+ 2.1,
+ 1.0,
+ 0.001,
+ 10,
+ false,
+ true,
+ false);
int numIterations = 2;
Path output = getTestTempDirPath("output");
- CDbwDriver.runJob(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"), output,
- EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"), output, measure, numIterations, 1);
checkRefPoints(numIterations);
}
public void testDirichlet() throws Exception {
Vector prototype = new DenseVector(2);
- DirichletDriver.runJob(getTestTempDirPath("testdata"), getTestTempDirPath("output"),
- L1ModelDistribution.class.getName(), prototype.getClass().getName(),
- 15, 5, 1.0, 1, true, true, 0, false);
+ DirichletDriver.runJob(getTestTempDirPath("testdata"),
+ getTestTempDirPath("output"),
+ L1ModelDistribution.class.getName(),
+ prototype.getClass().getName(),
+ 15,
+ 5,
+ 1.0,
+ 1,
+ true,
+ true,
+ 0,
+ false);
int numIterations = 2;
Path output = getTestTempDirPath("output");
- CDbwDriver.runJob(new Path(output, "clusters-5"), new Path(output, "clusteredPoints"), output,
- EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob(new Path(output, "clusters-5"),
+ new Path(output, "clusteredPoints"),
+ output,
+ new EuclideanDistanceMeasure(),
+ numIterations,
+ 1);
checkRefPoints(numIterations);
}