You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 20:08:05 UTC
svn commit: r909871 [3/7] - in
/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout: analysis/
cf/taste/ejb/ cf/taste/example/ cf/taste/example/bookcrossing/
cf/taste/example/grouplens/ cf/taste/example/jester/
cf/taste/example/netflix/ classi...
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/kmeans/DisplayKMeans.java Sat Feb 13 19:07:36 2010
@@ -26,78 +26,88 @@
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.dirichlet.DisplayDirichlet;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
+import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
class DisplayKMeans extends DisplayDirichlet {
+ private static final double t1 = 3.0;
+
+ private static final double t2 = 1.5;
+
+ private static List<List<Cluster>> clusters;
+
DisplayKMeans() {
initialize();
this.setTitle("K-Means Clusters (> 5% of population)");
}
-
- private static List<List<Cluster>> clusters;
-
- private static final double t1 = 3.0;
-
- private static final double t2 = 1.5;
-
+
@Override
public void paint(Graphics g) {
super.plotSampleData(g);
Graphics2D g2 = (Graphics2D) g;
Vector dv = new DenseVector(2);
- int i = clusters.size() - 1;
- for (List<Cluster> cls : clusters) {
+ int i = DisplayKMeans.clusters.size() - 1;
+ for (List<Cluster> cls : DisplayKMeans.clusters) {
g2.setStroke(new BasicStroke(i == 0 ? 3 : 1));
- g2.setColor(colors[Math.min(colors.length - 1, i--)]);
+ g2.setColor(DisplayDirichlet.colors[Math.min(DisplayDirichlet.colors.length - 1, i--)]);
for (Cluster cluster : cls) {
- //if (true || cluster.getNumPoints() > sampleData.size() * 0.05) {
- dv.assign(cluster.getStd() * 3);
- plotEllipse(g2, cluster.getCenter(), dv);
- //}
+ // if (true || cluster.getNumPoints() > sampleData.size() * 0.05) {
+ dv.assign(cluster.getStd() * 3);
+ DisplayDirichlet.plotEllipse(g2, cluster.getCenter(), dv);
+ // }
}
}
}
-
+
/**
- * This is the reference k-means implementation. Given its inputs it iterates
- * over the points and clusters until their centers converge or until the
- * maximum number of iterations is exceeded.
+ * This is the reference k-means implementation. Given its inputs it iterates over the points and clusters
+ * until their centers converge or until the maximum number of iterations is exceeded.
*
- * @param points the input List<Vector> of points
- * @param clusters the initial List<Cluster> of clusters
- * @param measure the DistanceMeasure to use
- * @param maxIter the maximum number of iterations
+ * @param points
+ * the input List<Vector> of points
+ * @param clusters
+ * the initial List<Cluster> of clusters
+ * @param measure
+ * the DistanceMeasure to use
+ * @param maxIter
+ * the maximum number of iterations
*/
private static void referenceKmeans(List<VectorWritable> points,
- List<List<Cluster>> clusters, DistanceMeasure measure, int maxIter) {
+ List<List<Cluster>> clusters,
+ DistanceMeasure measure,
+ int maxIter) {
boolean converged = false;
int iteration = 0;
while (!converged && iteration < maxIter) {
List<Cluster> next = new ArrayList<Cluster>();
List<Cluster> cs = clusters.get(iteration++);
- for (Cluster c : cs)
+ for (Cluster c : cs) {
next.add(new Cluster(c.getCenter()));
+ }
clusters.add(next);
- converged = iterateReference(points, clusters.get(iteration), measure);
+ converged = DisplayKMeans.iterateReference(points, clusters.get(iteration), measure);
}
}
-
+
/**
- * Perform a single iteration over the points and clusters, assigning points
- * to clusters and returning if the iterations are completed.
+   * Perform a single iteration over the points and clusters, assigning points to clusters and returning
+   * whether all of the clusters have converged.
*
- * @param points the List<Vector> having the input points
- * @param clusters the List<Cluster> clusters
- * @param measure a DistanceMeasure to use
+ * @param points
+ * the List<Vector> having the input points
+ * @param clusters
+ * the List<Cluster> clusters
+ * @param measure
+ * a DistanceMeasure to use
* @return
*/
private static boolean iterateReference(List<VectorWritable> points,
- List<Cluster> clusters, DistanceMeasure measure) {
+ List<Cluster> clusters,
+ DistanceMeasure measure) {
// iterate through all points, assigning each to the nearest cluster
for (VectorWritable point : points) {
Cluster closestCluster = null;
@@ -114,41 +124,44 @@
// test for convergence
boolean converged = true;
for (Cluster cluster : clusters) {
- if (!cluster.computeConvergence(measure, 0.001))
+ if (!cluster.computeConvergence(measure, 0.001)) {
converged = false;
+ }
}
// update the cluster centers
- if (!converged)
- for (Cluster cluster : clusters)
+ if (!converged) {
+ for (Cluster cluster : clusters) {
cluster.recomputeCenter();
+ }
+ }
return converged;
}
-
+
/**
* Iterate through the points, adding new canopies. Return the canopies.
*
* @param measure
- * a DistanceMeasure to use
+ * a DistanceMeasure to use
* @param points
- * a list<Vector> defining the points to be clustered
+ * a list<Vector> defining the points to be clustered
* @param t1
- * the T1 distance threshold
+ * the T1 distance threshold
* @param t2
- * the T2 distance threshold
+ * the T2 distance threshold
* @return the List<Canopy> created
*/
static List<Canopy> populateCanopies(DistanceMeasure measure,
- List<VectorWritable> points, double t1, double t2) {
+ List<VectorWritable> points,
+ double t1,
+ double t2) {
List<Canopy> canopies = new ArrayList<Canopy>();
/**
- * Reference Implementation: Given a distance metric, one can create
- * canopies as follows: Start with a list of the data points in any order,
- * and with two distance thresholds, T1 and T2, where T1 > T2. (These
- * thresholds can be set by the user, or selected by cross-validation.) Pick
- * a point on the list and measure its distance to all other points. Put all
- * points that are within distance threshold T1 into a canopy. Remove from
- * the list all points that are within distance threshold T2. Repeat until
- * the list is empty.
+ * Reference Implementation: Given a distance metric, one can create canopies as follows: Start with a
+ * list of the data points in any order, and with two distance thresholds, T1 and T2, where T1 > T2.
+ * (These thresholds can be set by the user, or selected by cross-validation.) Pick a point on the list
+ * and measure its distance to all other points. Put all points that are within distance threshold T1 into
+ * a canopy. Remove from the list all points that are within distance threshold T2. Repeat until the list
+ * is empty.
*/
int nextCanopyId = 0;
while (!points.isEmpty()) {
@@ -161,29 +174,34 @@
Vector p2 = ptIter.next().get();
double dist = measure.distance(p1, p2);
// Put all points that are within distance threshold T1 into the canopy
- if (dist < t1)
+ if (dist < t1) {
canopy.addPoint(p2);
+ }
// Remove from the list all points that are within distance threshold T2
- if (dist < t2)
+ if (dist < t2) {
ptIter.remove();
+ }
}
}
return canopies;
}
-
+
public static void main(String[] args) {
RandomUtils.useTestSeed();
- generateSamples();
+ DisplayDirichlet.generateSamples();
List<VectorWritable> points = new ArrayList<VectorWritable>();
- points.addAll(sampleData);
- List<Canopy> canopies = populateCanopies(new ManhattanDistanceMeasure(), points, t1, t2);
+ points.addAll(DisplayDirichlet.sampleData);
+ List<Canopy> canopies = DisplayKMeans.populateCanopies(new ManhattanDistanceMeasure(), points,
+ DisplayKMeans.t1, DisplayKMeans.t2);
DistanceMeasure measure = new ManhattanDistanceMeasure();
- clusters = new ArrayList<List<Cluster>>();
- clusters.add(new ArrayList<Cluster>());
- for (Canopy canopy : canopies)
- if (canopy.getNumPoints() > 0.05 * sampleData.size())
- clusters.get(0).add(new Cluster(canopy.getCenter()));
- referenceKmeans(sampleData, clusters, measure, 10);
+ DisplayKMeans.clusters = new ArrayList<List<Cluster>>();
+ DisplayKMeans.clusters.add(new ArrayList<Cluster>());
+ for (Canopy canopy : canopies) {
+ if (canopy.getNumPoints() > 0.05 * DisplayDirichlet.sampleData.size()) {
+ DisplayKMeans.clusters.get(0).add(new Cluster(canopy.getCenter()));
+ }
+ }
+ DisplayKMeans.referenceKmeans(DisplayDirichlet.sampleData, DisplayKMeans.clusters, measure, 10);
new DisplayKMeans();
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java Sat Feb 13 19:07:36 2010
@@ -26,85 +26,90 @@
import org.apache.mahout.clustering.dirichlet.DisplayDirichlet;
import org.apache.mahout.clustering.dirichlet.models.NormalModelDistribution;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.VectorWritable;
class DisplayMeanShift extends DisplayDirichlet {
+
+ private static final MeanShiftCanopyClusterer clusterer =
+ new MeanShiftCanopyClusterer(new EuclideanDistanceMeasure(), 1.0, 0.05, 0.5);
+ private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
+
private DisplayMeanShift() {
initialize();
this.setTitle("Canopy Clusters (> 1.5% of population)");
}
-
- private static final MeanShiftCanopyClusterer clusterer =
- new MeanShiftCanopyClusterer(new EuclideanDistanceMeasure(), 1.0, 0.05, 0.5);
- private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
-
// TODO this is never queried?
//private static final List<List<Vector>> iterationCenters = new ArrayList<List<Vector>>();
-
+
@Override
public void paint(Graphics g) {
Graphics2D g2 = (Graphics2D) g;
- double sx = (double) res / ds;
+ double sx = (double) res / DisplayDirichlet.ds;
g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-
+
// plot the axes
g2.setColor(Color.BLACK);
- Vector dv = new DenseVector(2).assign(size / 2.0);
- Vector dv1 = new DenseVector(2).assign(clusterer.getT1());
- Vector dv2 = new DenseVector(2).assign(clusterer.getT2());
- plotRectangle(g2, new DenseVector(2).assign(2), dv);
- plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-
+ Vector dv = new DenseVector(2).assign(DisplayDirichlet.size / 2.0);
+ Vector dv1 = new DenseVector(2).assign(DisplayMeanShift.clusterer.getT1());
+ Vector dv2 = new DenseVector(2).assign(DisplayMeanShift.clusterer.getT2());
+ DisplayDirichlet.plotRectangle(g2, new DenseVector(2).assign(2), dv);
+ DisplayDirichlet.plotRectangle(g2, new DenseVector(2).assign(-2), dv);
+
// plot the sample data
g2.setColor(Color.DARK_GRAY);
dv.assign(0.03);
- for (VectorWritable v : sampleData)
- plotRectangle(g2, v.get(), dv);
+ for (VectorWritable v : DisplayDirichlet.sampleData) {
+ DisplayDirichlet.plotRectangle(g2, v.get(), dv);
+ }
int i = 0;
- for (MeanShiftCanopy canopy : canopies)
- if (canopy.getBoundPoints().size() > 0.015 * sampleData.size()) {
- g2.setColor(colors[Math.min(i++, colors.length - 1)]);
- for (Vector v : canopy.getBoundPoints())
- plotRectangle(g2, v, dv);
- plotEllipse(g2, canopy.getCenter(), dv1);
- plotEllipse(g2, canopy.getCenter(), dv2);
+ for (MeanShiftCanopy canopy : DisplayMeanShift.canopies) {
+ if (canopy.getBoundPoints().size() > 0.015 * DisplayDirichlet.sampleData.size()) {
+ g2.setColor(DisplayDirichlet.colors[Math.min(i++, DisplayDirichlet.colors.length - 1)]);
+ for (Vector v : canopy.getBoundPoints()) {
+ DisplayDirichlet.plotRectangle(g2, v, dv);
+ }
+ DisplayDirichlet.plotEllipse(g2, canopy.getCenter(), dv1);
+ DisplayDirichlet.plotEllipse(g2, canopy.getCenter(), dv2);
}
+ }
}
-
+
private static void testReferenceImplementation() {
// add all points to the canopies
int nextCanopyId = 0;
- for (VectorWritable aRaw : sampleData) {
- clusterer.mergeCanopy(new MeanShiftCanopy(aRaw.get(), nextCanopyId++), canopies);
+ for (VectorWritable aRaw : DisplayDirichlet.sampleData) {
+ DisplayMeanShift.clusterer.mergeCanopy(
+ new MeanShiftCanopy(aRaw.get(), nextCanopyId++), DisplayMeanShift.canopies);
}
boolean done = false;
- while (!done) {// shift canopies to their centroids
+ while (!done) { // shift canopies to their centroids
done = true;
List<MeanShiftCanopy> migratedCanopies = new ArrayList<MeanShiftCanopy>();
//List<Vector> centers = new ArrayList<Vector>();
- for (MeanShiftCanopy canopy : canopies) {
+ for (MeanShiftCanopy canopy : DisplayMeanShift.canopies) {
//centers.add(canopy.getCenter());
- done = clusterer.shiftToMean(canopy) && done;
- clusterer.mergeCanopy(canopy, migratedCanopies);
+ done = DisplayMeanShift.clusterer.shiftToMean(canopy) && done;
+ DisplayMeanShift.clusterer.mergeCanopy(canopy, migratedCanopies);
}
//iterationCenters.add(centers);
- canopies = migratedCanopies;
+ DisplayMeanShift.canopies = migratedCanopies;
}
}
-
+
public static void main(String[] args) {
RandomUtils.useTestSeed();
- generateSamples();
- testReferenceImplementation();
- for (MeanShiftCanopy canopy : canopies)
+ DisplayDirichlet.generateSamples();
+ DisplayMeanShift.testReferenceImplementation();
+ for (MeanShiftCanopy canopy : DisplayMeanShift.canopies) {
System.out.println(canopy.toString());
+ }
new DisplayMeanShift();
}
-
+
static void generateResults() {
DisplayDirichlet.generateResults(new NormalModelDistribution());
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/Constants.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/Constants.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/Constants.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/Constants.java Sat Feb 13 19:07:36 2010
@@ -20,14 +20,14 @@
* Constants shared between examples.
*/
public interface Constants {
-
- /**
- * Directory containing output for examples.
- */
- String CLUSTERED_POINTS_OUTPUT_DIRECTORY = "/clustered-points";
- /**
- * Directory used to store the input after it has been processed from it's
- * original form into one suitable for processing by the clustering examples.
- */
- String DIRECTORY_CONTAINING_CONVERTED_INPUT = "/data";
+
+ /**
+ * Directory containing output for examples.
+ */
+ String CLUSTERED_POINTS_OUTPUT_DIRECTORY = "/clustered-points";
+ /**
+   * Directory used to store the input after it has been processed from its
+ * original form into one suitable for processing by the clustering examples.
+ */
+ String DIRECTORY_CONTAINING_CONVERTED_INPUT = "/data";
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java Sat Feb 13 19:07:36 2010
@@ -17,6 +17,8 @@
package org.apache.mahout.clustering.syntheticcontrol.canopy;
+import java.io.IOException;
+
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -39,29 +41,28 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-
public class InputDriver {
/**Logger for this class.*/
private static final Logger LOG = LoggerFactory.getLogger(InputDriver.class);
-
+
private InputDriver() {
}
-
+
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option vectorOpt = obuilder.withLongName("vector").withRequired(false).withArgument(
- abuilder.withName("v").withMinimum(1).withMaximum(1).create()).withDescription(
- "The vector implementation to use.").withShortName("v").create();
-
+ abuilder.withName("v").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The vector implementation to use.").withShortName("v").create();
+
Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(vectorOpt).withOption(helpOpt).create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
+ vectorOpt).withOption(helpOpt).create();
try {
Parser parser = new Parser();
@@ -71,35 +72,35 @@
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
String vectorClassName = cmdLine.getValue(vectorOpt, "org.apache.mahout.math.RandomAccessSparseVector").toString();
- runJob(input, output, vectorClassName);
+ InputDriver.runJob(input, output, vectorClassName);
} catch (OptionException e) {
- LOG.error("Exception parsing command line: ", e);
+ InputDriver.LOG.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
-
+
public static void runJob(String input, String output, String vectorClassName) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(InputDriver.class);
-
+
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
conf.set("vector.implementation.class.name", vectorClassName);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
-
+
conf.setMapperClass(InputMapper.class);
-
+
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
-
+
client.setConf(conf);
JobClient.runJob(conf);
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java Sat Feb 13 19:07:36 2010
@@ -17,41 +17,43 @@
package org.apache.mahout.clustering.syntheticcontrol.canopy;
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
-
-public class InputMapper extends MapReduceBase implements
- Mapper<LongWritable, Text, Text, VectorWritable> {
-
+public class InputMapper extends MapReduceBase implements Mapper<LongWritable,Text,Text,VectorWritable> {
+
private static final Pattern SPACE = Pattern.compile(" ");
-
+
private Constructor<?> constructor;
-
+
private VectorWritable vectorWritable;
-
+
@Override
- public void map(LongWritable key, Text values,
- OutputCollector<Text, VectorWritable> output, Reporter reporter) throws IOException {
- String[] numbers = SPACE.split(values.toString());
+ public void map(LongWritable key,
+ Text values,
+ OutputCollector<Text,VectorWritable> output,
+ Reporter reporter) throws IOException {
+ String[] numbers = InputMapper.SPACE.split(values.toString());
// sometimes there are multiple separator spaces
List<Double> doubles = new ArrayList<Double>();
for (String value : numbers) {
- if (value.length() > 0)
+ if (value.length() > 0) {
doubles.add(Double.valueOf(value));
+ }
}
try {
Vector result = (Vector) constructor.newInstance(doubles.size());
@@ -61,7 +63,7 @@
}
vectorWritable.set(result);
output.collect(new Text(String.valueOf(index)), vectorWritable);
-
+
} catch (InstantiationException e) {
throw new IllegalStateException(e);
} catch (IllegalAccessException e) {
@@ -70,8 +72,7 @@
throw new IllegalStateException(e);
}
}
-
-
+
@Override
public void configure(JobConf job) {
vectorWritable = new VectorWritable();
@@ -84,6 +85,6 @@
} catch (ClassNotFoundException e) {
throw new IllegalStateException(e);
}
-
+
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Sat Feb 13 19:07:36 2010
@@ -34,90 +34,83 @@
import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.math.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class Job {
- /** Logger for this class.*/
+ /** Logger for this class. */
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
-
- private Job() {
- }
-
+
+ private Job() { }
+
public static void main(String[] args) throws IOException, ClassNotFoundException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt = obuilder.withLongName("input").withRequired(false).withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
- withDescription("The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
- Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The Path to put the output in").withShortName("o").create();
-
- Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
- abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).
- withDescription("The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
- Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
- abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
- withDescription("The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v").create();
-
- Option t1Opt = obuilder.withLongName("t1").withRequired(false).withArgument(
- abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).
- withDescription("t1").withShortName("t1").create();
- Option t2Opt = obuilder.withLongName("t2").withRequired(false).withArgument(
- abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).
- withDescription("t2").withShortName("t2").create();
-
-
- Option helpOpt = obuilder.withLongName("help").
- withDescription("Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
- .withOption(measureClassOpt).withOption(vectorClassOpt)
- .withOption(t1Opt).withOption(t2Opt)
- .withOption(helpOpt).create();
-
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return;
- }
-
- String input = cmdLine.getValue(inputOpt, "testdata").toString();
- String output = cmdLine.getValue(outputOpt, "output").toString();
- String measureClass = cmdLine.getValue(
- measureClassOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
-
- String className = cmdLine.getValue(vectorClassOpt, "org.apache.mahout.math.RandomAccessSparseVector").toString();
- //Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
- double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
- double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
-
- runJob(input, output, measureClass, t1, t2);
- } catch (OptionException e) {
- LOG.error("Exception", e);
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option inputOpt = obuilder.withLongName("input").withRequired(false).withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Path for input Vectors. Must be a SequenceFile of Writable, Vector").withShortName("i").create();
+ Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Path to put the output in").withShortName("o").create();
+
+ Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+ // Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
+ // abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).
+ // withDescription("The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v").create();
+
+ Option t1Opt = obuilder.withLongName("t1").withRequired(false).withArgument(
+ abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("t1").withShortName(
+ "t1").create();
+ Option t2Opt = obuilder.withLongName("t2").withRequired(false).withArgument(
+ abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("t2").withShortName(
+ "t2").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
+ measureClassOpt)// .withOption(vectorClassOpt)
+ .withOption(t1Opt).withOption(t2Opt).withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
+ return;
}
+
+ String input = cmdLine.getValue(inputOpt, "testdata").toString();
+ String output = cmdLine.getValue(outputOpt, "output").toString();
+ String measureClass = cmdLine.getValue(measureClassOpt,
+ "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+
+ // String className = cmdLine.getValue(vectorClassOpt,
+ // "org.apache.mahout.math.RandomAccessSparseVector").toString();
+ // Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
+ double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
+ double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
+
+ Job.runJob(input, output, measureClass, t1, t2);
+ } catch (OptionException e) {
+ Job.LOG.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
}
-
+
/**
- * Run the canopy clustering job on an input dataset using the given distance
- * measure, t1 and t2 parameters. All output data will be written to the
- * output directory, which will be initially deleted if it exists. The
- * clustered points will reside in the path <output>/clustered-points. By
- * default, the job expects the a file containing synthetic_control.data as
- * obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
- * resides in a directory named "testdata", and writes output to a directory
- * named "output".
+ * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 parameters.
+ * All output data will be written to the output directory, which will be initially deleted if it exists.
+   * The clustered points will reside in the path <output>/clustered-points. By default, the job expects that a
+ * file containing synthetic_control.data as obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named
+ * "testdata", and writes output to a directory named "output".
*
* @param input
* the String denoting the input directory path
@@ -130,21 +123,20 @@
* @param t2
* the canopy T2 threshold
*/
- private static void runJob(String input, String output,
- String measureClassName, double t1, double t2) throws IOException {
+ private static void runJob(String input, String output, String measureClassName, double t1, double t2) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
-
+
Path outPath = new Path(output);
client.setConf(conf);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
- if (dfs.exists(outPath))
+ if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
- String directoryContainingConvertedInput = output
- + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- CanopyClusteringJob.runJob(directoryContainingConvertedInput, output,
- measureClassName, t1, t2);
+ }
+ String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
+ InputDriver.runJob(input, directoryContainingConvertedInput,
+ "org.apache.mahout.math.RandomAccessSparseVector");
+ CanopyClusteringJob.runJob(directoryContainingConvertedInput, output, measureClassName, t1, t2);
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java Sat Feb 13 19:07:36 2010
@@ -48,43 +48,43 @@
import org.slf4j.LoggerFactory;
public class Job {
-
- /**Logger for this class.*/
+
+ /** Logger for this class. */
private static final Logger log = LoggerFactory.getLogger(Job.class);
-
- private Job() {
- }
-
+
+ private Job() { }
+
public static void main(String[] args) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option maxIterOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
Option topicsOpt = DefaultOptionCreator.kOption().withRequired(false).create();
-
+
Option redOpt = obuilder.withLongName("reducerNum").withRequired(false).withArgument(
- abuilder.withName("r").withMinimum(1).withMaximum(1).create()).withDescription("The number of reducers to use.")
- .withShortName("r").create();
-
+ abuilder.withName("r").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The number of reducers to use.").withShortName("r").create();
+
Option vectorOpt = obuilder.withLongName("vector").withRequired(false).withArgument(
- abuilder.withName("v").withMinimum(1).withMaximum(1).create()).withDescription("The vector implementation to use.")
- .withShortName("v").create();
-
+ abuilder.withName("v").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The vector implementation to use.").withShortName("v").create();
+
Option mOpt = obuilder.withLongName("alpha").withRequired(false).withShortName("m").withArgument(
- abuilder.withName("alpha").withMinimum(1).withMaximum(1).create()).withDescription(
- "The alpha0 value for the DirichletDistribution.").create();
-
- Option modelOpt = obuilder.withLongName("modelClass").withRequired(false).withShortName("d").withArgument(
- abuilder.withName("modelClass").withMinimum(1).withMaximum(1).create())
+ abuilder.withName("alpha").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The alpha0 value for the DirichletDistribution.").create();
+
+ Option modelOpt = obuilder.withLongName("modelClass").withRequired(false).withShortName("d")
+ .withArgument(abuilder.withName("modelClass").withMinimum(1).withMaximum(1).create())
.withDescription("The ModelDistribution class name.").create();
Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).withOption(
- maxIterOpt).withOption(mOpt).withOption(topicsOpt).withOption(redOpt).withOption(helpOpt).create();
-
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+ .withOption(modelOpt).withOption(maxIterOpt).withOption(mOpt).withOption(topicsOpt)
+ .withOption(redOpt).withOption(helpOpt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -93,41 +93,50 @@
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
String modelFactory = cmdLine.getValue(modelOpt,
- "org.apache.mahout.clustering.syntheticcontrol.dirichlet.NormalScModelDistribution").toString();
+ "org.apache.mahout.clustering.syntheticcontrol.dirichlet.NormalScModelDistribution").toString();
int numModels = Integer.parseInt(cmdLine.getValue(topicsOpt, "10").toString());
int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt, "5").toString());
double alpha_0 = Double.parseDouble(cmdLine.getValue(mOpt, "1.0").toString());
int numReducers = Integer.parseInt(cmdLine.getValue(redOpt, "1").toString());
- String vectorClassName = cmdLine.getValue(vectorOpt, "org.apache.mahout.math.RandomAccessSparseVector").toString();
- runJob(input, output, modelFactory, numModels, maxIterations, alpha_0, numReducers, vectorClassName);
+ String vectorClassName = cmdLine.getValue(vectorOpt, "org.apache.mahout.math.RandomAccessSparseVector")
+ .toString();
+ Job
+ .runJob(input, output, modelFactory, numModels, maxIterations, alpha_0, numReducers,
+ vectorClassName);
} catch (OptionException e) {
- log.error("Exception parsing command line: ", e);
+ Job.log.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
-
+
/**
- * Run the job using supplied arguments, deleting the output directory if it
- * exists beforehand
+ * Run the job using supplied arguments, deleting the output directory if it exists beforehand
*
- * @param input the directory pathname for input points
- * @param output the directory pathname for output points
- * @param modelFactory the ModelDistribution class name
- * @param numModels the number of Models
- * @param maxIterations the maximum number of iterations
- * @param alpha_0 the alpha0 value for the DirichletDistribution
- * @param numReducers the desired number of reducers
- * @throws IllegalAccessException
- * @throws InstantiationException
- * @throws ClassNotFoundException
- * @throws InvocationTargetException
- * @throws NoSuchMethodException
- * @throws IllegalArgumentException
- * @throws SecurityException
+ * @param input
+ * the directory pathname for input points
+ * @param output
+ * the directory pathname for output points
+ * @param modelFactory
+ * the ModelDistribution class name
+ * @param numModels
+ * the number of Models
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param alpha_0
+ * the alpha0 value for the DirichletDistribution
+ * @param numReducers
+ * the desired number of reducers
+ * @throws IllegalAccessException
+ * @throws InstantiationException
+ * @throws ClassNotFoundException
+ * @throws InvocationTargetException
+ * @throws NoSuchMethodException
+ * @throws IllegalArgumentException
+ * @throws SecurityException
*/
public static void runJob(String input,
String output,
@@ -136,9 +145,14 @@
int maxIterations,
double alpha_0,
int numReducers,
- String vectorClassName)
- throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
- SecurityException, IllegalArgumentException, NoSuchMethodException, InvocationTargetException {
+ String vectorClassName) throws IOException,
+ ClassNotFoundException,
+ InstantiationException,
+ IllegalAccessException,
+ SecurityException,
+ IllegalArgumentException,
+ NoSuchMethodException,
+ InvocationTargetException {
// delete the output directory
JobConf conf = new JobConf(DirichletJob.class);
Path outPath = new Path(output);
@@ -149,33 +163,41 @@
fs.mkdirs(outPath);
String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
InputDriver.runJob(input, directoryContainingConvertedInput, vectorClassName);
- DirichletDriver.runJob(directoryContainingConvertedInput,
- output + "/state",
- modelFactory,
- vectorClassName,
- 60,
- numModels,
- maxIterations,
- alpha_0,
- numReducers);
- printResults(output + "/state", modelFactory, vectorClassName, 60, maxIterations, numModels, alpha_0);
+ DirichletDriver.runJob(directoryContainingConvertedInput, output + "/state", modelFactory,
+ vectorClassName, 60, numModels, maxIterations, alpha_0, numReducers);
+ Job.printResults(output + "/state", modelFactory, vectorClassName, 60, maxIterations, numModels, alpha_0);
}
-
+
/**
* Prints out all of the clusters during each iteration
- * @param output the String output directory
- * @param modelDistribution the String class name of the ModelDistribution
- * @param vectorClassName the String class name of the Vector to use
- * @param prototypeSize the size of the Vector prototype for the Dirichlet Models
- * @param numIterations the int number of Iterations
- * @param numModels the int number of models
- * @param alpha_0 the double alpha_0 value
- * @throws InvocationTargetException
- * @throws NoSuchMethodException
+ *
+ * @param output
+ * the String output directory
+ * @param modelDistribution
+ * the String class name of the ModelDistribution
+ * @param vectorClassName
+ * the String class name of the Vector to use
+ * @param prototypeSize
+ * the size of the Vector prototype for the Dirichlet Models
+ * @param numIterations
+ * the int number of Iterations
+ * @param numModels
+ * the int number of models
+ * @param alpha_0
+ * the double alpha_0 value
+ * @throws InvocationTargetException
+ * @throws NoSuchMethodException
* @throws SecurityException
*/
- public static void printResults(String output, String modelDistribution, String vectorClassName, int prototypeSize,
- int numIterations, int numModels, double alpha_0) throws SecurityException, NoSuchMethodException, InvocationTargetException {
+ public static void printResults(String output,
+ String modelDistribution,
+ String vectorClassName,
+ int prototypeSize,
+ int numIterations,
+ int numModels,
+ double alpha_0) throws SecurityException,
+ NoSuchMethodException,
+ InvocationTargetException {
List<List<DirichletCluster<VectorWritable>>> clusters = new ArrayList<List<DirichletCluster<VectorWritable>>>();
JobConf conf = new JobConf(KMeansDriver.class);
conf.set(DirichletDriver.MODEL_FACTORY_KEY, modelDistribution);
@@ -187,14 +209,17 @@
conf.set(DirichletDriver.PROTOTYPE_SIZE_KEY, Integer.toString(prototypeSize));
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
- printResults(clusters, 0);
-
+ Job.printResults(clusters, 0);
+
}
-
+
/**
* Actually prints out the clusters
- * @param clusters a List of Lists of DirichletClusters
- * @param significant the minimum number of samples to enable printing a model
+ *
+ * @param clusters
+ * a List of Lists of DirichletClusters
+ * @param significant
+ * the minimum number of samples to enable printing a model
*/
private static void printResults(List<List<DirichletCluster<VectorWritable>>> clusters, int significant) {
int row = 0;
@@ -211,6 +236,6 @@
result.append('\n');
}
result.append('\n');
- log.info(result.toString());
+ Job.log.info(result.toString());
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/NormalScModelDistribution.java Sat Feb 13 19:07:36 2010
@@ -29,14 +29,15 @@
* DirichletCluster algorithm. Uses a Normal Distribution
*/
public class NormalScModelDistribution extends NormalModelDistribution {
-
+
@Override
public Model<VectorWritable>[] sampleFromPrior(int howMany) {
Model<VectorWritable>[] result = new NormalModel[howMany];
for (int i = 0; i < howMany; i++) {
DenseVector mean = new DenseVector(60);
- for (int j = 0; j < 60; j++)
+ for (int j = 0; j < 60; j++) {
mean.set(j, UncommonDistributions.rNorm(30, 0.5));
+ }
result[i] = new NormalModel(mean, 1);
}
return result;
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sat Feb 13 19:07:36 2010
@@ -17,8 +17,6 @@
package org.apache.mahout.clustering.syntheticcontrol.kmeans;
-import static org.apache.mahout.clustering.syntheticcontrol.Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
-
import java.io.IOException;
import org.apache.commons.cli2.CommandLine;
@@ -36,117 +34,127 @@
import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class Job {
-
- /** Logger for this class.*/
+
+ /** Logger for this class. */
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
-
- private Job() {
- }
-
- public static void main(String[] args) throws IOException,
- ClassNotFoundException {
+
+ private Job() { }
+
+ public static void main(String[] args) throws IOException, ClassNotFoundException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
Option maxIterationsOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
-
+
Option measureClassOpt = obuilder.withLongName("distance").withRequired(false).withArgument(
- abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
-
+ abuilder.withName("distance").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Distance Measure to use. Default is SquaredEuclidean").withShortName("m").create();
+
Option t1Opt = obuilder.withLongName("t1").withRequired(false).withArgument(
- abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription(
- "The t1 value to use.").withShortName("m").create();
+ abuilder.withName("t1").withMinimum(1).withMaximum(1).create()).withDescription("The t1 value to use.")
+ .withShortName("m").create();
Option t2Opt = obuilder.withLongName("t2").withRequired(false).withArgument(
- abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription(
- "The t2 value to use.").withShortName("m").create();
+ abuilder.withName("t2").withMinimum(1).withMaximum(1).create()).withDescription("The t2 value to use.")
+ .withShortName("m").create();
Option vectorClassOpt = obuilder.withLongName("vectorClass").withRequired(false).withArgument(
- abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v").create();
-
+ abuilder.withName("vectorClass").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The Vector implementation class name. Default is RandomAccessSparseVector.class").withShortName("v")
+ .create();
+
Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
- .withOption(measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt)
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
+ measureClassOpt).withOption(convergenceDeltaOpt).withOption(maxIterationsOpt)
.withOption(vectorClassOpt).withOption(t1Opt).withOption(t2Opt).withOption(helpOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
}
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
- String measureClass = cmdLine.getValue(measureClassOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+ String measureClass = cmdLine.getValue(measureClassOpt,
+ "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
double t1 = Double.parseDouble(cmdLine.getValue(t1Opt, "80").toString());
double t2 = Double.parseDouble(cmdLine.getValue(t2Opt, "55").toString());
double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterationsOpt, 10).toString());
- String className = cmdLine.getValue(vectorClassOpt, "org.apache.mahout.math.RandomAccessSparseVector").toString();
- Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
-
- runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
+ // String className = cmdLine.getValue(vectorClassOpt,
+ // "org.apache.mahout.math.RandomAccessSparseVector").toString();
+ // Class<? extends Vector> vectorClass = Class.forName(className).asSubclass(Vector.class);
+
+ Job.runJob(input, output, measureClass, t1, t2, convergenceDelta, maxIterations);
} catch (OptionException e) {
- LOG.error("Exception", e);
+ Job.LOG.error("Exception", e);
CommandLineUtil.printHelp(group);
}
}
-
+
/**
- * Run the kmeans clustering job on an input dataset using the given distance
- * measure, t1, t2 and iteration parameters. All output data will be written
- * to the output directory, which will be initially deleted if it exists. The
- * clustered points will reside in the path <output>/clustered-points. By
- * default, the job expects the a file containing synthetic_control.data as
- * obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
- * resides in a directory named "testdata", and writes output to a directory
- * named "output".
+ * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
+ * parameters. All output data will be written to the output directory, which will be initially deleted if
+ * it exists. The clustered points will reside in the path <output>/clustered-points. By default, the job
+ * expects that a file containing synthetic_control.data as obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named
+ * "testdata", and writes output to a directory named "output".
*
- * @param input the String denoting the input directory path
- * @param output the String denoting the output directory path
- * @param measureClass the String class name of the DistanceMeasure to use
- * @param t1 the canopy T1 threshold
- * @param t2 the canopy T2 threshold
- * @param convergenceDelta the double convergence criteria for iterations
- * @param maxIterations the int maximum number of iterations
+ * @param input
+ * the String denoting the input directory path
+ * @param output
+ * the String denoting the output directory path
+ * @param measureClass
+ * the String class name of the DistanceMeasure to use
+ * @param t1
+ * the canopy T1 threshold
+ * @param t2
+ * the canopy T2 threshold
+ * @param convergenceDelta
+ * the double convergence criteria for iterations
+ * @param maxIterations
+ * the int maximum number of iterations
*/
- private static void runJob(String input, String output, String measureClass,
- double t1, double t2, double convergenceDelta, int maxIterations) throws IOException {
+ private static void runJob(String input,
+ String output,
+ String measureClass,
+ double t1,
+ double t2,
+ double convergenceDelta,
+ int maxIterations) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
-
+
Path outPath = new Path(output);
client.setConf(conf);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
- if (dfs.exists(outPath))
+ if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
- final String directoryContainingConvertedInput = output
- + DIRECTORY_CONTAINING_CONVERTED_INPUT;
+ }
+ final String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
System.out.println("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+ InputDriver.runJob(input, directoryContainingConvertedInput,
+ "org.apache.mahout.math.RandomAccessSparseVector");
System.out.println("Running Canopy to get initial clusters");
- CanopyDriver.runJob(directoryContainingConvertedInput, output
- + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, measureClass,
- t1, t2);
+ CanopyDriver.runJob(directoryContainingConvertedInput,
+ output + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, measureClass, t1, t2);
System.out.println("Running KMeans");
- KMeansDriver.runJob(directoryContainingConvertedInput, output
- + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, output,
- measureClass, convergenceDelta, maxIterations, 1);
+ KMeansDriver.runJob(directoryContainingConvertedInput,
+ output + CanopyClusteringJob.DEFAULT_CANOPIES_OUTPUT_DIRECTORY, output, measureClass, convergenceDelta,
+ maxIterations, 1);
}
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java Sat Feb 13 19:07:36 2010
@@ -40,20 +40,20 @@
import org.slf4j.LoggerFactory;
public class InputDriver {
- /**Logger for this class.*/
+ /** Logger for this class. */
private static final Logger LOG = LoggerFactory.getLogger(InputDriver.class);
-
- private InputDriver() {
- }
-
+
+ private InputDriver() { }
+
public static void main(String[] args) throws IOException {
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option helpOpt = DefaultOptionCreator.helpOption();
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(helpOpt).create();
-
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(helpOpt)
+ .create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -62,34 +62,32 @@
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
- runJob(input, output);
+ InputDriver.runJob(input, output);
} catch (OptionException e) {
- LOG.error("Exception parsing command line: ", e);
+ InputDriver.LOG.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
-
+
public static void runJob(String input, String output) throws IOException {
JobClient client = new JobClient();
- JobConf conf = new JobConf(
- org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver.class);
-
+ JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver.class);
+
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(MeanShiftCanopy.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
conf.setOutputFormat(SequenceFileOutputFormat.class);
- conf
- .setMapperClass(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputMapper.class);
+ conf.setMapperClass(org.apache.mahout.clustering.syntheticcontrol.meanshift.InputMapper.class);
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
-
+
client.setConf(conf);
JobClient.runJob(conf);
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java Sat Feb 13 19:07:36 2010
@@ -17,6 +17,11 @@
package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
@@ -27,33 +32,31 @@
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-public class InputMapper extends MapReduceBase implements
- Mapper<LongWritable, Text, Text, MeanShiftCanopy> {
-
+public class InputMapper extends MapReduceBase implements Mapper<LongWritable,Text,Text,MeanShiftCanopy> {
+
private static final Pattern SPACE = Pattern.compile(" ");
private int nextCanopyId = 0;
@Override
- public void map(LongWritable key, Text values,
- OutputCollector<Text, MeanShiftCanopy> output, Reporter reporter) throws IOException {
- String[] numbers = SPACE.split(values.toString());
+ public void map(LongWritable key,
+ Text values,
+ OutputCollector<Text,MeanShiftCanopy> output,
+ Reporter reporter) throws IOException {
+ String[] numbers = InputMapper.SPACE.split(values.toString());
// sometimes there are multiple separator spaces
List<Double> doubles = new ArrayList<Double>();
for (String value : numbers) {
- if (value.length() > 0)
+ if (value.length() > 0) {
doubles.add(Double.valueOf(value));
+ }
}
Vector point = new DenseVector(doubles.size());
int index = 0;
- for (Double d : doubles)
+ for (Double d : doubles) {
point.set(index++, d);
+ }
MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++);
output.collect(new Text(), canopy);
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Sat Feb 13 19:07:36 2010
@@ -42,40 +42,38 @@
public class Job {
/** Logger for this class. */
private static final Logger LOG = LoggerFactory.getLogger(Job.class);
-
+
private static final String CLUSTERED_POINTS_OUTPUT_DIRECTORY = "/clusteredPoints";
-
- private Job() {
- }
-
+
+ private Job() { }
+
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option convergenceDeltaOpt = DefaultOptionCreator.convergenceOption().withRequired(false).create();
Option maxIterOpt = DefaultOptionCreator.maxIterOption().withRequired(false).create();
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d").
- withArgument(abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create()).
- withDescription("The distance measure class name.").create();
-
-
- Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(false).withShortName("t1").
- withArgument(abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create()).
- withDescription("The T1 distance threshold.").create();
-
- Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(false).withShortName("t2").
- withArgument(abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create()).
- withDescription("The T1 distance threshold.").create();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(modelOpt).
- withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt).withOption(maxIterOpt).
- withOption(threshold2Opt).create();
-
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Option modelOpt = obuilder.withLongName("distanceClass").withRequired(false).withShortName("d")
+ .withArgument(abuilder.withName("distanceClass").withMinimum(1).withMaximum(1).create())
+ .withDescription("The distance measure class name.").create();
+
+ Option threshold1Opt = obuilder.withLongName("threshold_1").withRequired(false).withShortName("t1")
+ .withArgument(abuilder.withName("threshold_1").withMinimum(1).withMaximum(1).create())
+ .withDescription("The T1 distance threshold.").create();
+
+ Option threshold2Opt = obuilder.withLongName("threshold_2").withRequired(false).withShortName("t2")
+ .withArgument(abuilder.withName("threshold_2").withMinimum(1).withMaximum(1).create())
+ .withDescription("The T1 distance threshold.").create();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
+ .withOption(modelOpt).withOption(helpOpt).withOption(convergenceDeltaOpt).withOption(threshold1Opt)
+ .withOption(maxIterOpt).withOption(threshold2Opt).create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -84,47 +82,55 @@
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
- String measureClassName = cmdLine.getValue(modelOpt, "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
+ String measureClassName = cmdLine.getValue(modelOpt,
+ "org.apache.mahout.common.distance.EuclideanDistanceMeasure").toString();
double t1 = Double.parseDouble(cmdLine.getValue(threshold1Opt, "47.6").toString());
double t2 = Double.parseDouble(cmdLine.getValue(threshold2Opt, "1").toString());
double convergenceDelta = Double.parseDouble(cmdLine.getValue(convergenceDeltaOpt, "0.5").toString());
int maxIterations = Integer.parseInt(cmdLine.getValue(maxIterOpt, "10").toString());
- runJob(input, output, measureClassName, t1, t2, convergenceDelta,
- maxIterations);
+ Job.runJob(input, output, measureClassName, t1, t2, convergenceDelta, maxIterations);
} catch (OptionException e) {
- LOG.error("Exception parsing command line: ", e);
+ Job.LOG.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
-
+
/**
- * Run the meanshift clustering job on an input dataset using the given
- * distance measure, t1, t2 and iteration parameters. All output data will be
- * written to the output directory, which will be initially deleted if it
- * exists. The clustered points will reside in the path
- * <output>/clustered-points. By default, the job expects the a file
- * containing synthetic_control.data as obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
- * resides in a directory named "testdata", and writes output to a directory
- * named "output".
+ * Run the meanshift clustering job on an input dataset using the given distance measure, t1, t2 and
+ * iteration parameters. All output data will be written to the output directory, which will be initially
+ * deleted if it exists. The clustered points will reside in the path <output>/clustered-points. By default,
+ * the job expects a file containing synthetic_control.data, as obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series, to reside in a directory
+ * named "testdata", and writes output to a directory named "output".
*
- * @param input the String denoting the input directory path
- * @param output the String denoting the output directory path
- * @param measureClassName the String class name of the DistanceMeasure to use
- * @param t1 the meanshift canopy T1 threshold
- * @param t2 the meanshift canopy T2 threshold
- * @param convergenceDelta the double convergence criteria for iterations
- * @param maxIterations the int maximum number of iterations
+ * @param input
+ * the String denoting the input directory path
+ * @param output
+ * the String denoting the output directory path
+ * @param measureClassName
+ * the String class name of the DistanceMeasure to use
+ * @param t1
+ * the meanshift canopy T1 threshold
+ * @param t2
+ * the meanshift canopy T2 threshold
+ * @param convergenceDelta
+ * the double convergence criteria for iterations
+ * @param maxIterations
+ * the int maximum number of iterations
*/
- private static void runJob(String input, String output,
- String measureClassName, double t1, double t2, double convergenceDelta,
- int maxIterations) throws IOException {
+ private static void runJob(String input,
+ String output,
+ String measureClassName,
+ double t1,
+ double t2,
+ double convergenceDelta,
+ int maxIterations) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
-
+
Path outPath = new Path(output);
client.setConf(conf);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
@@ -133,11 +139,11 @@
}
String directoryContainingConvertedInput = output + Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT;
InputDriver.runJob(input, directoryContainingConvertedInput);
- MeanShiftCanopyJob.runJob(directoryContainingConvertedInput, output + "/meanshift",
- measureClassName, t1, t2, convergenceDelta, maxIterations);
+ MeanShiftCanopyJob.runJob(directoryContainingConvertedInput, output + "/meanshift", measureClassName, t1,
+ t2, convergenceDelta, maxIterations);
FileStatus[] status = dfs.listStatus(new Path(output + "/meanshift"));
OutputDriver.runJob(status[status.length - 1].getPath().toString(),
- output + CLUSTERED_POINTS_OUTPUT_DIRECTORY);
+ output + Job.CLUSTERED_POINTS_OUTPUT_DIRECTORY);
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java Sat Feb 13 19:07:36 2010
@@ -41,18 +41,18 @@
public class OutputDriver {
/** Logger for this class. */
private static final Logger LOG = LoggerFactory.getLogger(OutputDriver.class);
-
- private OutputDriver() {
- }
-
+
+ private OutputDriver() { }
+
public static void main(String[] args) throws IOException {
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = DefaultOptionCreator.inputOption().withRequired(false).create();
Option outputOpt = DefaultOptionCreator.outputOption().withRequired(false).create();
Option helpOpt = DefaultOptionCreator.helpOption();
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(helpOpt).create();
-
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(helpOpt)
+ .create();
+
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -61,35 +61,34 @@
CommandLineUtil.printHelp(group);
return;
}
-
+
String input = cmdLine.getValue(inputOpt, "testdata").toString();
String output = cmdLine.getValue(outputOpt, "output").toString();
- runJob(input, output);
+ OutputDriver.runJob(input, output);
} catch (OptionException e) {
- LOG.error("Exception parsing command line: ", e);
+ OutputDriver.LOG.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
-
+
public static void runJob(String input, String output) throws IOException {
JobClient client = new JobClient();
- JobConf conf = new JobConf(
- org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);
-
+ JobConf conf = new JobConf(org.apache.mahout.clustering.syntheticcontrol.meanshift.OutputDriver.class);
+
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setInputFormat(SequenceFileInputFormat.class);
-
+
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
-
+
conf.setMapperClass(OutputMapper.class);
-
+
conf.setReducerClass(Reducer.class);
conf.setNumReduceTasks(0);
-
+
client.setConf(conf);
JobClient.runJob(conf);
}
-
+
}
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java?rev=909871&r1=909870&r2=909871&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java Sat Feb 13 19:07:36 2010
@@ -17,6 +17,8 @@
package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+import java.io.IOException;
+
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
@@ -27,27 +29,24 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-
-public class OutputMapper extends MapReduceBase implements
- Mapper<Text, MeanShiftCanopy, Text, Text> {
-
+public class OutputMapper extends MapReduceBase implements Mapper<Text,MeanShiftCanopy,Text,Text> {
+
private static final Logger log = LoggerFactory.getLogger(OutputMapper.class);
-
+
private int clusters = 0;
-
+
@Override
- public void map(Text key, MeanShiftCanopy canopy, OutputCollector<Text, Text> output,
- Reporter reporter) throws IOException {
+ public void map(Text key, MeanShiftCanopy canopy, OutputCollector<Text,Text> output, Reporter reporter) throws IOException {
clusters++;
- for (Vector point : canopy.getBoundPoints())
+ for (Vector point : canopy.getBoundPoints()) {
output.collect(key, new Text(point.asFormatString()));
+ }
}
-
+
@Override
public void close() throws IOException {
- log.info("+++ Clusters={}", clusters);
+ OutputMapper.log.info("+++ Clusters={}", clusters);
super.close();
}
-
+
}