You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/04/22 22:47:40 UTC
svn commit: r937051 - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/dirichlet/
core/src/main/java/org/apache/mahout/clus...
Author: jeastman
Date: Thu Apr 22 20:47:39 2010
New Revision: 937051
URL: http://svn.apache.org/viewvc?rev=937051&view=rev
Log:
MAHOUT-236: Improvements to consistency of all clustering algorithms:
- Cleaned up cluster asFormatString(x) so all clusters have similar naming
- Added ClusterDumper tests for fuzzyK and MeanShift
- Cleaned up cluster job intermediate file nomenclature:
- All initial clusters go into clusters-0 directory
- All cluster outputs go into clusters-i directory, where i is iteration number (1..n)
- All clustered points go into clusteredPoints directory
- Moved intermediate file nomenclature to standard constants in ClusterBase
All tests run, but I think FuzzyKMeans is either broken or I am making a pilot error, since it produces anomalous outputs with the ClusterDumper and CDbw tests. I will debug that next.
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletCluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/ClusterBase.java Thu Apr 22 20:47:39 2010
@@ -40,6 +40,16 @@ import com.google.gson.reflect.TypeToken
*
*/
public abstract class ClusterBase implements Writable, Cluster {
+
+ // default directory for all clustered points
+ public static final String CLUSTERED_POINTS_DIR = "/clusteredPoints";
+
+ // default directory for initial clusters to prime iterative clustering algorithms
+ public static final String INITIAL_CLUSTERS_DIR = "/clusters-0";
+
+ // default directory for output of clusters per iteration
+ public static final String CLUSTERS_DIR = "/clusters-";
+
// this cluster's clusterId
private int id;
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java Thu Apr 22 20:47:39 2010
@@ -122,12 +122,12 @@ public class Canopy extends ClusterBase
@Override
public String toString() {
- return getIdentifier() + " - " + getCenter().asFormatString();
+ return getIdentifier() + ": " + getCenter().asFormatString();
}
@Override
public String getIdentifier() {
- return "C" + getId();
+ return "C-" + getId();
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusteringJob.java Thu Apr 22 20:47:39 2010
@@ -27,6 +27,7 @@ import org.apache.commons.cli2.builder.A
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.slf4j.Logger;
@@ -42,12 +43,7 @@ import org.slf4j.LoggerFactory;
public final class CanopyClusteringJob {
private static final Logger log = LoggerFactory.getLogger(CanopyClusteringJob.class);
-
- /** The default name of the canopies output sub-directory. */
- public static final String DEFAULT_CANOPIES_OUTPUT_DIRECTORY = "/canopies";
- /** The default name of the directory used to output clusters. */
- public static final String DEFAULT_CLUSTER_OUTPUT_DIRECTORY = ClusterDriver.DEFAULT_CLUSTER_OUTPUT_DIRECTORY;
-
+
private CanopyClusteringJob() { }
/**
@@ -135,9 +131,10 @@ public final class CanopyClusteringJob {
*/
public static void runJob(String input, String output,
String measureClassName, double t1, double t2) throws IOException {
- CanopyDriver.runJob(input, output + DEFAULT_CANOPIES_OUTPUT_DIRECTORY,
+ String canopyOutputDir = output + ClusterBase.CLUSTERS_DIR + "0";
+ CanopyDriver.runJob(input, canopyOutputDir,
measureClassName, t1, t2);
- ClusterDriver.runJob(input, output + DEFAULT_CANOPIES_OUTPUT_DIRECTORY, output,
+ ClusterDriver.runJob(input, canopyOutputDir, output,
measureClassName, t1, t2);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/ClusterDriver.java Thu Apr 22 20:47:39 2010
@@ -46,7 +46,7 @@ import org.slf4j.LoggerFactory;
public final class ClusterDriver {
- public static final String DEFAULT_CLUSTER_OUTPUT_DIRECTORY = "/clusters";
+ public static final String DEFAULT_CLUSTERED_POINTS_DIRECTORY = "/clusteredPoints";
private static final Logger log = LoggerFactory.getLogger(ClusterDriver.class);
@@ -159,7 +159,7 @@ public final class ClusterDriver {
conf.setOutputFormat(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(points));
- Path outPath = new Path(output + DEFAULT_CLUSTER_OUTPUT_DIRECTORY);
+ Path outPath = new Path(output + DEFAULT_CLUSTERED_POINTS_DIRECTORY);
FileOutputFormat.setOutputPath(conf, outPath);
conf.setMapperClass(ClusterMapper.class);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletCluster.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletCluster.java Thu Apr 22 20:47:39 2010
@@ -31,54 +31,55 @@ import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
public class DirichletCluster<O> implements Writable, Cluster {
-
+
@Override
public void readFields(DataInput in) throws IOException {
this.totalCount = in.readDouble();
this.model = readModel(in);
}
-
+
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(totalCount);
writeModel(out, model);
}
-
+
private Model<O> model; // the model for this iteration
-
+
private double totalCount; // total count of observations for the model
-
+
public DirichletCluster(Model<O> model, double totalCount) {
super();
this.model = model;
this.totalCount = totalCount;
}
-
+
public DirichletCluster(Model<O> model) {
super();
this.model = model;
this.totalCount = 0.0;
}
-
+
public DirichletCluster() {
super();
}
-
+
public Model<O> getModel() {
return model;
}
-
+
public void setModel(Model<O> model) {
this.model = model;
this.totalCount += model.count();
}
-
+
public double getTotalCount() {
return totalCount;
}
-
- private static final Type clusterType = new TypeToken<DirichletCluster<Vector>>() { }.getType();
-
+
+ private static final Type clusterType = new TypeToken<DirichletCluster<Vector>>() {
+ }.getType();
+
/** Reads a typed Model instance from the input stream */
public static <O> Model<O> readModel(DataInput in) throws IOException {
String modelClassName = in.readUTF();
@@ -95,18 +96,18 @@ public class DirichletCluster<O> impleme
model.readFields(in);
return model;
}
-
+
/** Writes a typed Model instance to the output stream */
public static void writeModel(DataOutput out, Model<?> model) throws IOException {
out.writeUTF(model.getClass().getName());
model.write(out);
}
-
+
@Override
public String asFormatString(String[] bindings) {
- return model.toString();
+ return "C-" + model.getId() + ": " + model.toString();
}
-
+
@Override
public String asJsonString() {
GsonBuilder builder = new GsonBuilder();
@@ -129,5 +130,5 @@ public class DirichletCluster<O> impleme
public int getNumPoints() {
return model.getNumPoints();
}
-
+
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Thu Apr 22 20:47:39 2010
@@ -41,10 +41,9 @@ import org.apache.hadoop.mapred.JobClien
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.dirichlet.models.VectorModelDistribution;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.meanshift.MeanShiftCanopyClusterMapper;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.Vector;
@@ -54,6 +53,7 @@ import org.slf4j.LoggerFactory;
public class DirichletDriver {
+
public static final String STATE_IN_KEY = "org.apache.mahout.clustering.dirichlet.stateIn";
public static final String MODEL_FACTORY_KEY = "org.apache.mahout.clustering.dirichlet.modelFactory";
@@ -215,20 +215,20 @@ public class DirichletDriver {
NoSuchMethodException,
InvocationTargetException {
- String stateIn = output + "/state-0";
- writeInitialState(output, stateIn, modelFactory, modelPrototype, prototypeSize, numClusters, alpha_0);
+ String clustersIn = output + ClusterBase.INITIAL_CLUSTERS_DIR;
+ writeInitialState(output, clustersIn, modelFactory, modelPrototype, prototypeSize, numClusters, alpha_0);
- for (int iteration = 0; iteration < maxIterations; iteration++) {
+ for (int iteration = 1; iteration <= maxIterations; iteration++) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
- String stateOut = output + "/state-" + (iteration + 1);
- runIteration(input, stateIn, stateOut, modelFactory, modelPrototype, prototypeSize, numClusters,
+ String clustersOut = output + ClusterBase.CLUSTERS_DIR + iteration;
+ runIteration(input, clustersIn, clustersOut, modelFactory, modelPrototype, prototypeSize, numClusters,
alpha_0, numReducers);
// now point the input to the old output directory
- stateIn = stateOut;
+ clustersIn = clustersOut;
}
// now cluster the most likely points
- runClustering(input, stateIn, output + "/clusters");
+ runClustering(input, clustersIn, output + ClusterBase.CLUSTERED_POINTS_DIR);
}
private static void writeInitialState(String output,
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterMapper.java Thu Apr 22 20:47:39 2010
@@ -42,7 +42,7 @@ public class FuzzyKMeansClusterMapper ex
@Override
public void map(WritableComparable<?> key, VectorWritable point, OutputCollector<IntWritable, VectorWritable> output,
Reporter reporter) throws IOException {
- clusterer.outputPointWithClusterProbabilities(key.toString(), (NamedVector) point.get(), clusters, output);
+ clusterer.outputPointWithClusterProbabilities(key.toString(), point.get(), clusters, output);
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Thu Apr 22 20:47:39 2010
@@ -45,6 +45,7 @@ import org.apache.hadoop.mapred.JobClien
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
@@ -225,26 +226,26 @@ public final class FuzzyKMeansDriver {
float m) {
boolean converged = false;
- int iteration = 0;
+ int iteration = 1;
// iterate until the clusters converge
- while (!converged && (iteration < maxIterations)) {
+ while (!converged && (iteration <= maxIterations)) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
- String clustersOut = output + File.separator + "clusters-" + iteration;
+ String clustersOut = output + ClusterBase.CLUSTERS_DIR + iteration;
converged = runIteration(input, clustersIn, clustersOut, measureClass,
convergenceDelta, numMapTasks, numReduceTasks, iteration, m);
// now point the input to the old output directory
- clustersIn = output + File.separator + "clusters-" + iteration;
+ clustersIn = clustersOut;
iteration++;
}
// now actually cluster the points
log.info("Clustering ");
- runClustering(input, clustersIn, output + File.separator + "points", measureClass,
+ runClustering(input, clustersIn, output + ClusterBase.CLUSTERED_POINTS_DIR, measureClass,
convergenceDelta, numMapTasks, m);
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java Thu Apr 22 20:47:39 2010
@@ -157,15 +157,15 @@ public class SoftCluster extends Cluster
@Override
public String toString() {
- return getIdentifier() + " - " + getCenter().asFormatString();
+ return getIdentifier() + ": " + getCenter().asFormatString();
}
@Override
public String getIdentifier() {
if (converged) {
- return "V" + this.getId();
+ return "V-" + this.getId();
} else {
- return "C" + this.getId();
+ return "C-" + this.getId();
}
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Cluster.java Thu Apr 22 20:47:39 2010
@@ -168,12 +168,12 @@ public class Cluster extends ClusterBase
@Override
public String toString() {
- return getIdentifier() + " - " + getCenter().asFormatString();
+ return getIdentifier() + ": " + getCenter().asFormatString();
}
@Override
public String getIdentifier() {
- return (converged ? "V" : "C") + getId();
+ return (converged ? "V-" : "C-") + getId();
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterMapper.java Thu Apr 22 20:47:39 2010
@@ -43,7 +43,7 @@ public class KMeansClusterMapper extends
VectorWritable point,
OutputCollector<IntWritable,VectorWritable> output,
Reporter reporter) throws IOException {
- clusterer.outputPointWithClusterInfo((NamedVector) point.get(), clusters, output);
+ clusterer.outputPointWithClusterInfo(point.get(), clusters, output);
}
@Override
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansClusterer.java Thu Apr 22 20:47:39 2010
@@ -81,21 +81,21 @@ public class KMeansClusterer {
output.collect(new Text(nearestCluster.getIdentifier()), new KMeansInfo(1, point));
}
- public void outputPointWithClusterInfo(NamedVector point,
+ public void outputPointWithClusterInfo(Vector vector,
List<Cluster> clusters,
OutputCollector<IntWritable,VectorWritable> output) throws IOException {
Cluster nearestCluster = null;
double nearestDistance = Double.MAX_VALUE;
for (Cluster cluster : clusters) {
Vector clusterCenter = cluster.getCenter();
- double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, point);
+ double distance = measure.distance(clusterCenter.getLengthSquared(), clusterCenter, vector);
if ((distance < nearestDistance) || (nearestCluster == null)) {
nearestCluster = cluster;
nearestDistance = distance;
}
}
- output.collect(new IntWritable(nearestCluster.getId()), new VectorWritable(point));
+ output.collect(new IntWritable(nearestCluster.getId()), new VectorWritable(vector));
}
/**
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansConfigKeys.java Thu Apr 22 20:47:39 2010
@@ -25,9 +25,7 @@ public interface KMeansConfigKeys {
String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.kmeans.measure";
/** Configuration key for convergence threshold. */
String CLUSTER_CONVERGENCE_KEY = "org.apache.mahout.clustering.kmeans.convergence";
- /** Configuration key for ?? */
+ /** Configuration key for iteration cluster path */
String CLUSTER_PATH_KEY = "org.apache.mahout.clustering.kmeans.path";
- /** The number of iterations that have taken place */
- String ITERATION_NUMBER = "org.apache.mahout.clustering.kmeans.iteration";
}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Thu Apr 22 20:47:39 2010
@@ -39,6 +39,7 @@ import org.apache.hadoop.mapred.JobClien
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
@@ -47,10 +48,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public final class KMeansDriver {
-
- /** The name of the directory used to output final results. */
- public static final String DEFAULT_OUTPUT_DIRECTORY = "/points";
-
+
private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
private KMeansDriver() {}
@@ -206,19 +204,19 @@ public final class KMeansDriver {
new Object[] {convergenceDelta, maxIterations, numReduceTasks, VectorWritable.class.getName()});
}
boolean converged = false;
- int iteration = 0;
- while (!converged && (iteration < maxIterations)) {
+ int iteration = 1;
+ while (!converged && (iteration <= maxIterations)) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
- String clustersOut = output + "/clusters-" + iteration;
+ String clustersOut = output + ClusterBase.CLUSTERS_DIR + iteration;
converged = runIteration(input, clustersIn, clustersOut, measureClass, delta, numReduceTasks, iteration);
// now point the input to the old output directory
- clustersIn = output + "/clusters-" + iteration;
+ clustersIn = clustersOut;
iteration++;
}
// now actually cluster the points
log.info("Clustering ");
- runClustering(input, clustersIn, output + DEFAULT_OUTPUT_DIRECTORY, measureClass, delta);
+ runClustering(input, clustersIn, output + ClusterBase.CLUSTERED_POINTS_DIR, measureClass, delta);
}
/**
@@ -265,7 +263,6 @@ public final class KMeansDriver {
conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn);
conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measureClass);
conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
- conf.set(KMeansConfigKeys.ITERATION_NUMBER, String.valueOf(iteration));
try {
JobClient.runJob(conf);
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopy.java Thu Apr 22 20:47:39 2010
@@ -148,7 +148,7 @@ public class MeanShiftCanopy extends Clu
@Override
public String getIdentifier() {
- return (converged ? "V" : "C") + getId();
+ return (converged ? "V-" : "C-") + getId();
}
void init(MeanShiftCanopy canopy) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyJob.java Thu Apr 22 20:47:39 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
@@ -148,7 +149,7 @@ public class MeanShiftCanopyJob {
}
fs.mkdirs(outPath);
- String clustersIn = output + "/initial-canopies";
+ String clustersIn = output + ClusterBase.INITIAL_CLUSTERS_DIR;
if (inputIsCanopies) {
clustersIn = input;
} else {
@@ -157,21 +158,22 @@ public class MeanShiftCanopyJob {
// iterate until the clusters converge
boolean converged = false;
- int iteration = 0;
- while (!converged && (iteration < maxIterations)) {
+ int iteration = 1;
+ while (!converged && (iteration <= maxIterations)) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
- String clustersOut = output + "/canopies-" + iteration;
+ String clustersOut = output + ClusterBase.CLUSTERS_DIR + iteration;
String controlOut = output + CONTROL_CONVERGED;
MeanShiftCanopyDriver.runJob(clustersIn, clustersOut, controlOut, measureClassName, t1, t2, convergenceDelta);
converged = FileSystem.get(conf).exists(new Path(controlOut));
// now point the input to the old output directory
- clustersIn = output + "/canopies-" + iteration;
+ clustersIn = clustersOut;
iteration++;
}
// now cluster the points
- MeanShiftCanopyDriver.runClustering((inputIsCanopies ? input : output + "/initial-canopies"), clustersIn, output + "/clusters");
+ MeanShiftCanopyDriver.runClustering((inputIsCanopies ? input : output + ClusterBase.INITIAL_CLUSTERS_DIR), clustersIn, output
+ + ClusterBase.CLUSTERED_POINTS_DIR);
}
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/TestClusterInterface.java Thu Apr 22 20:47:39 2010
@@ -108,7 +108,7 @@ public class TestClusterInterface extend
NormalModel model = new NormalModel(5, m, 0.75);
Cluster cluster = new DirichletCluster<VectorWritable>(model, 35.0);
String format = cluster.asFormatString(null);
- assertEquals("format", "nm{n=0 m=[1.100, 2.200, 3.300] sd=0.75}", format);
+ assertEquals("format", "C-5: nm{n=0 m=[1.100, 2.200, 3.300] sd=0.75}", format);
}
public void testDirichletNormalModelClusterAsJsonString() {
@@ -131,7 +131,7 @@ public class TestClusterInterface extend
AsymmetricSampledNormalModel model = new AsymmetricSampledNormalModel(5, m, m);
Cluster cluster = new DirichletCluster<VectorWritable>(model, 35.0);
String format = cluster.asFormatString(null);
- assertEquals("format", "asnm{n=0 m=[1.100, 2.200, 3.300] sd=[1.100, 2.200, 3.300]}", format);
+ assertEquals("format", "C-5: asnm{n=0 m=[1.100, 2.200, 3.300] sd=[1.100, 2.200, 3.300]}", format);
}
public void testDirichletAsymmetricSampledNormalModelClusterAsJsonString() {
@@ -155,7 +155,7 @@ public class TestClusterInterface extend
L1Model model = new L1Model(5, m);
Cluster cluster = new DirichletCluster<VectorWritable>(model, 35.0);
String format = cluster.asFormatString(null);
- assertEquals("format", "l1m{n=0 c=[1.100, 2.200, 3.300]}", format);
+ assertEquals("format", "C-5: l1m{n=0 c=[1.100, 2.200, 3.300]}", format);
}
public void testDirichletL1ModelClusterAsJsonString() {
@@ -179,7 +179,7 @@ public class TestClusterInterface extend
Cluster cluster = new Canopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [1.100, 2.200, 3.300]", formatString);
+ assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
}
public void testCanopyAsFormatStringSparse() {
@@ -189,7 +189,7 @@ public class TestClusterInterface extend
Cluster cluster = new Canopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
}
public void testCanopyAsFormatStringWithBindings() {
@@ -199,7 +199,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, null };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C123: [fee:1.100, 1:2.200, 2:3.300]", formatString);
+ assertEquals("format", "C-123: [fee:1.100, 1:2.200, 2:3.300]", formatString);
}
public void testCanopyAsFormatStringSparseWithBindings() {
@@ -209,7 +209,7 @@ public class TestClusterInterface extend
Cluster cluster = new Canopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
}
public void testClusterAsFormatString() {
@@ -218,7 +218,7 @@ public class TestClusterInterface extend
Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [1.100, 2.200, 3.300]", formatString);
+ assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
}
public void testClusterAsFormatStringSparse() {
@@ -228,7 +228,7 @@ public class TestClusterInterface extend
Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
}
public void testClusterAsFormatStringWithBindings() {
@@ -238,7 +238,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
+ assertEquals("format", "C-123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
}
public void testClusterAsFormatStringSparseWithBindings() {
@@ -248,7 +248,7 @@ public class TestClusterInterface extend
Cluster cluster = new org.apache.mahout.clustering.kmeans.Cluster(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
}
public void testMSCanopyAsFormatString() {
@@ -257,7 +257,7 @@ public class TestClusterInterface extend
Cluster cluster = new MeanShiftCanopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [1.100, 2.200, 3.300]", formatString);
+ assertEquals("format", "C-123: [1.100, 2.200, 3.300]", formatString);
}
public void testMSCanopyAsFormatStringSparse() {
@@ -267,7 +267,7 @@ public class TestClusterInterface extend
Cluster cluster = new MeanShiftCanopy(m, 123);
String formatString = cluster.asFormatString(null);
System.out.println(formatString);
- assertEquals("format", "C123: [0:1.100, 2:3.300]", formatString);
+ assertEquals("format", "C-123: [0:1.100, 2:3.300]", formatString);
}
public void testMSCanopyAsFormatStringWithBindings() {
@@ -277,7 +277,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
+ assertEquals("format", "C-123: [fee:1.100, 1:2.200, foo:3.300]", formatString);
}
public void testMSCanopyAsFormatStringSparseWithBindings() {
@@ -288,7 +288,7 @@ public class TestClusterInterface extend
String[] bindings = { "fee", null, "foo" };
String formatString = cluster.asFormatString(bindings);
System.out.println(formatString);
- assertEquals("format", "C123: [fee:1.100, foo:3.300]", formatString);
+ assertEquals("format", "C-123: [fee:1.100, foo:3.300]", formatString);
}
}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/canopy/TestCanopyCreation.java Thu Apr 22 20:47:39 2010
@@ -352,12 +352,12 @@ public class TestCanopyCreation extends
Text key = new Text();
Canopy canopy = new Canopy();
assertTrue("more to come", reader.next(key, canopy));
- assertEquals("1st key", "C0", key.toString());
+ assertEquals("1st key", "C-0", key.toString());
// Canopy canopy = new Canopy(value); //Canopy.decodeCanopy(value.toString());
assertEquals("1st x value", 1.5, canopy.getCenter().get(0));
assertEquals("1st y value", 1.5, canopy.getCenter().get(1));
assertTrue("more to come", reader.next(key, canopy));
- assertEquals("2nd key", "C1", key.toString());
+ assertEquals("2nd key", "C-1", key.toString());
// canopy = Canopy.decodeCanopy(canopy.toString());
assertEquals("1st x value", 4.333333333333334, canopy.getCenter().get(0));
assertEquals("1st y value", 4.333333333333334, canopy.getCenter().get(1));
@@ -388,11 +388,11 @@ public class TestCanopyCreation extends
Text key = new Text();
Canopy value = new Canopy();
assertTrue("more to come", reader.next(key, value));
- assertEquals("1st key", "C0", key.toString());
+ assertEquals("1st key", "C-0", key.toString());
assertEquals("1st x value", 1.8, value.getCenter().get(0));
assertEquals("1st y value", 1.8, value.getCenter().get(1));
assertTrue("more to come", reader.next(key, value));
- assertEquals("2nd key", "C1", key.toString());
+ assertEquals("2nd key", "C-1", key.toString());
assertEquals("1st x value", 4.433333333333334, value.getCenter().get(0));
assertEquals("1st y value", 4.433333333333334, value.getCenter().get(1));
assertFalse("more to come", reader.next(key, value));
@@ -493,7 +493,7 @@ public class TestCanopyCreation extends
// now run the Job
CanopyClusteringJob.runJob("testdata", "output", ManhattanDistanceMeasure.class.getName(), 3.1, 2.1);
// TODO: change
- Path path = new Path("output/clusters/part-00000");
+ Path path = new Path("output/clusteredPoints/part-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
int count = 0;
/*
@@ -525,7 +525,7 @@ public class TestCanopyCreation extends
ClusteringTestUtils.writePointsToFile(points, "testdata/file2", fs, conf);
// now run the Job
CanopyClusteringJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
- Path path = new Path("output/clusters/part-00000");
+ Path path = new Path("output/clusteredPoints/part-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
int count = 0;
/*
@@ -566,12 +566,12 @@ public class TestCanopyCreation extends
Text key = new Text();
Canopy value = new Canopy();
assertTrue("more to come", reader.next(key, value));
- assertEquals("1st key", "C0", key.toString());
+ assertEquals("1st key", "C-0", key.toString());
assertEquals("1st x value", 1.5, value.getCenter().get(0));
assertEquals("1st y value", 1.5, value.getCenter().get(1));
assertTrue("more to come", reader.next(key, value));
- assertEquals("2nd key", "C1", key.toString());
+ assertEquals("2nd key", "C-1", key.toString());
assertEquals("1st x value", 4.333333333333334, value.getCenter().get(0));
assertEquals("1st y value", 4.333333333333334, value.getCenter().get(1));
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/fuzzykmeans/TestFuzzyKmeansClustering.java Thu Apr 22 20:47:39 2010
@@ -205,11 +205,11 @@ public class TestFuzzyKmeansClustering e
EuclideanDistanceMeasure.class.getName(), 0.001, 2, 1, k + 1, 2);
// now compare the expected clusters with actual
- File outDir = new File("output/points");
+ File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
outDir.list();
// assertEquals("output dir files?", 4, outFiles.length);
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/points/part-00000"), conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
IntWritable key = new IntWritable();
VectorWritable out = new VectorWritable();
while (reader.next(key, out)) {
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Thu Apr 22 20:47:39 2010
@@ -373,10 +373,10 @@ public class TestKmeansClustering extend
KMeansDriver.runJob("testdata/points", "testdata/clusters", "output", EuclideanDistanceMeasure.class
.getName(), 0.001, 10, k + 1);
// now compare the expected clusters with actual
- File outDir = new File("output/points");
+ File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
// assertEquals("output dir files?", 4, outFiles.length);
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/points/part-00000"), conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
int[] expect = expectedNumPoints[k];
DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
// The key is the clusterId
@@ -424,12 +424,12 @@ public class TestKmeansClustering extend
.getName(), 0.001, 10, 1);
// now compare the expected clusters with actual
- File outDir = new File("output/points");
+ File outDir = new File("output/clusteredPoints");
assertTrue("output dir exists?", outDir.exists());
String[] outFiles = outDir.list();
assertEquals("output dir files?", 4, outFiles.length);
DummyOutputCollector<IntWritable,VectorWritable> collector = new DummyOutputCollector<IntWritable,VectorWritable>();
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/points/part-00000"), conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/clusteredPoints/part-00000"), conf);
// The key is the clusterId
IntWritable clusterId = new IntWritable(0);
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Thu Apr 22 20:47:39 2010
@@ -204,7 +204,7 @@ public class TestMeanShift extends Mahou
for (Map.Entry<String, MeanShiftCanopy> stringMeanShiftCanopyEntry : refCanopyMap.entrySet()) {
MeanShiftCanopy ref = stringMeanShiftCanopyEntry.getValue();
- MeanShiftCanopy canopy = canopyMap.get((ref.isConverged() ? "V" : "C") + ref.getCanopyId());
+ MeanShiftCanopy canopy = canopyMap.get((ref.isConverged() ? "V-" : "C-") + ref.getCanopyId());
assertEquals("ids", ref.getCanopyId(), canopy.getCanopyId());
assertEquals("centers(" + ref.getIdentifier() + ')', ref.getCenter().asFormatString(), canopy.getCenter().asFormatString());
assertEquals("bound points", ref.getBoundPoints().size(), canopy.getBoundPoints().size());
@@ -272,7 +272,7 @@ public class TestMeanShift extends Mahou
for (Map.Entry<String, MeanShiftCanopy> mapEntry : reducerReferenceMap.entrySet()) {
MeanShiftCanopy refCanopy = mapEntry.getValue();
- List<MeanShiftCanopy> values = reduceCollector.getValue(new Text((refCanopy.isConverged() ? "V" : "C")
+ List<MeanShiftCanopy> values = reduceCollector.getValue(new Text((refCanopy.isConverged() ? "V-" : "C-")
+ refCanopy.getCanopyId()));
assertEquals("values", 1, values.size());
MeanShiftCanopy reducerCanopy = values.get(0);
@@ -306,7 +306,7 @@ public class TestMeanShift extends Mahou
// now run the Job
MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 4, 1, 0.5, 10);
JobConf conf = new JobConf(MeanShiftCanopyDriver.class);
- Path outPart = new Path("output/canopies-2/part-00000");
+ Path outPart = new Path("output/clusters-3/part-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, outPart, conf);
Text key = new Text();
MeanShiftCanopy value = new MeanShiftCanopy();
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Thu Apr 22 20:47:39 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
@@ -150,10 +151,10 @@ public final class Job {
"org.apache.mahout.math.RandomAccessSparseVector");
log.info("Running Canopy to get initial clusters");
CanopyDriver.runJob(directoryContainingConvertedInput,
- output + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, measureClass, t1, t2);
+ output + ClusterBase.INITIAL_CLUSTERS_DIR, measureClass, t1, t2);
log.info("Running KMeans");
KMeansDriver.runJob(directoryContainingConvertedInput,
- output + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, output, measureClass, convergenceDelta,
+ output + ClusterBase.INITIAL_CLUSTERS_DIR, output, measureClass, convergenceDelta,
maxIterations, 1);
}
}
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Thu Apr 22 20:47:39 2010
@@ -37,7 +37,9 @@ import org.apache.mahout.clustering.cano
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.dirichlet.DirichletDriver;
import org.apache.mahout.clustering.dirichlet.models.L1ModelDistribution;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
@@ -135,17 +137,34 @@ public class TestClusterDumper extends M
public void testCanopy() throws Exception { // now run the Job
CanopyClusteringJob.runJob("testdata/points", "output", EuclideanDistanceMeasure.class.getName(), 8, 4);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper("output/canopies", null);
+ ClusterDumper clusterDumper = new ClusterDumper("output/clusters-0", null);
clusterDumper.printClusters();
}
public void testKmeans() throws Exception {
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.runJob("testdata/points", "testdata/canopies", EuclideanDistanceMeasure.class.getName(), 8, 4);
+ CanopyDriver.runJob("testdata/points", "output/clusters-0", EuclideanDistanceMeasure.class.getName(), 8, 4);
// now run the KMeans job
- KMeansDriver.runJob("testdata/points", "testdata/canopies", "output", EuclideanDistanceMeasure.class.getName(),
+ KMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(),
0.001, 10, 1);
// run ClusterDumper
+ ClusterDumper clusterDumper = new ClusterDumper("output/clusters-2", null);
+ clusterDumper.printClusters();
+ }
+
+ public void testFuzzyKmeans() throws Exception {
+ // now run the Canopy job to prime kMeans canopies
+ CanopyDriver.runJob("testdata/points", "output/clusters-0", EuclideanDistanceMeasure.class.getName(), 8, 4);
+ // now run the FuzzyKMeans job
+ FuzzyKMeansDriver.runJob("testdata/points", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1, 2);
+ // run ClusterDumper
+ ClusterDumper clusterDumper = new ClusterDumper("output/clusters-3", null);
+ clusterDumper.printClusters();
+ }
+
+ public void testMeanShift() throws Exception {
+ MeanShiftCanopyJob.runJob("testdata/points", "output", EuclideanDistanceMeasure.class.getName(), 9, 1.0, 0.001, 10);
+ // run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper("output/clusters-1", null);
clusterDumper.printClusters();
}
@@ -156,7 +175,7 @@ public class TestClusterDumper extends M
L1ModelDistribution.class.getName(), prototype.getClass().getName(), prototype
.size(), 15, 10, 1.0, 1);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper("output/state-10", null);
+ ClusterDumper clusterDumper = new ClusterDumper("output/clusters-10", null);
clusterDumper.printClusters();
}
}
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=937051&r1=937050&r2=937051&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java Thu Apr 22 20:47:39 2010
@@ -79,34 +79,34 @@ public class TestCDbwEvaluator extends M
public void testCanopy() throws Exception { // now run the Job
CanopyClusteringJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
int numIterations = 2;
- CDbwDriver.runJob("output/canopies", "output/clusters", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-0", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
checkRefPoints(numIterations);
}
public void testKmeans() throws Exception {
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.runJob("testdata", "output/canopies", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
+ CanopyDriver.runJob("testdata", "output/clusters-0", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
// now run the KMeans job
- KMeansDriver.runJob("testdata", "output/canopies", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
+ KMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1);
int numIterations = 2;
- CDbwDriver.runJob("output/clusters-1", "output/points", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
checkRefPoints(numIterations);
}
public void testFuzzyKmeans() throws Exception {
// now run the Canopy job to prime kMeans canopies
- CanopyDriver.runJob("testdata", "output/canopies", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
+ CanopyDriver.runJob("testdata", "output/clusters-0", EuclideanDistanceMeasure.class.getName(), 3.1, 2.1);
// now run the KMeans job
- FuzzyKMeansDriver.runJob("testdata", "output/canopies", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1, 2);
+ FuzzyKMeansDriver.runJob("testdata", "output/clusters-0", "output", EuclideanDistanceMeasure.class.getName(), 0.001, 10, 1, 1, 2);
int numIterations = 2;
- CDbwDriver.runJob("output/clusters-3", "output/points", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-4", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
checkRefPoints(numIterations);
}
public void testMeanShift() throws Exception {
MeanShiftCanopyJob.runJob("testdata", "output", EuclideanDistanceMeasure.class.getName(), 2.1, 1.0, 0.001, 10);
int numIterations = 2;
- CDbwDriver.runJob("output/canopies-1", "output/clusters", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-2", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
checkRefPoints(numIterations);
}
@@ -115,7 +115,7 @@ public class TestCDbwEvaluator extends M
DirichletDriver.runJob("testdata", "output", L1ModelDistribution.class.getName(), prototype.getClass().getName(), prototype
.size(), 15, 5, 1.0, 1);
int numIterations = 2;
- CDbwDriver.runJob("output/state-5", "output/clusters", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
+ CDbwDriver.runJob("output/clusters-5", "output/clusteredPoints", "output", EuclideanDistanceMeasure.class.getName(), numIterations, 1);
checkRefPoints(numIterations);
}