You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2008/06/16 17:49:44 UTC

svn commit: r668205 - in /lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering: ./ syntheticcontrol/ syntheticcontrol/canopy/ syntheticcontrol/kmeans/ syntheticcontrol/meanshift/

Author: jeastman
Date: Mon Jun 16 08:49:43 2008
New Revision: 668205

URL: http://svn.apache.org/viewvc?rev=668205&view=rev
Log:
MAHOUT-59: improved cluster examples from the Jira patch with output formatting to a common format

Added:
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java
    lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,57 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Map-only Hadoop job that runs {@link InputMapper} over the input files and
+ * writes the mapper output to the given output directory.
+ */
+public class InputDriver {
+
+  /**
+   * Command-line entry point.
+   *
+   * @param args args[0] is the input path, args[1] is the output path
+   */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configure and run the input conversion job. Any exception raised while
+   * running the job is printed to standard error and not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobConf conf = new JobConf(InputDriver.class);
+
+    // InputMapper is declared as Mapper<LongWritable, Text, Text, Text>, so
+    // both the key and the value classes of this job's output are Text.
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+
+    conf.setMapperClass(InputMapper.class);
+
+    // zero reduce tasks: the job has no reduce phase
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0);
+
+    try {
+      // the static runJob submits the job itself; no JobClient instance needed
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,52 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.matrix.DenseVector;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Mapper that turns one line of space-separated numbers into a
+ * {@link DenseVector} and emits the vector's format string.
+ */
+public class InputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+
+  /**
+   * Parse the numbers on the input line and collect the resulting vector,
+   * encoded with asFormatString(), as a Text value under a null key.
+   *
+   * @param key the LongWritable input key (not used)
+   * @param values the Text holding one line of space-separated numbers
+   * @param output the OutputCollector receiving the encoded vector
+   * @param reporter the Reporter (not used)
+   * @throws IOException if the collector fails
+   */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    // runs of separator spaces yield empty tokens; those are skipped
+    List<Double> parsed = new ArrayList<Double>();
+    for (String token : values.toString().split(" ")) {
+      if (token.length() == 0) {
+        continue;
+      }
+      parsed.add(Double.valueOf(token));
+    }
+
+    Vector result = new DenseVector(parsed.size());
+    for (int i = 0; i < parsed.size(); i++) {
+      result.set(i, parsed.get(i));
+    }
+    output.collect(null, new Text(result.asFormatString()));
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,78 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
+
+/**
+ * Example driver that chains the input conversion, canopy clustering and
+ * output formatting jobs for the synthetic control data.
+ */
+public class Job {
+
+  /**
+   * Command-line entry point. With exactly five arguments (input, output,
+   * distance measure class name, t1, t2) the job runs with those values; with
+   * any other argument count it runs with the built-in defaults.
+   *
+   * @param args the command-line arguments
+   */
+  public static void main(String[] args) {
+    if (args.length != 5) {
+      runJob("testdata", "output",
+          "org.apache.mahout.utils.EuclideanDistanceMeasure", 80, 55);
+      return;
+    }
+    String input = args[0];
+    String output = args[1];
+    String measureClassName = args[2];
+    double t1 = Double.parseDouble(args[3]);
+    double t2 = Double.parseDouble(args[4]);
+    runJob(input, output, measureClassName, t1, t2);
+  }
+
+  /**
+   * Run the canopy clustering job on an input dataset using the given distance
+   * measure and the t1 and t2 thresholds. The output directory is deleted
+   * first if it already exists, and everything is then written beneath it; the
+   * final formatting step writes to {@code <output>/clustered-points}. By
+   * default, the job expects a file containing synthetic_control.data, as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series,
+   * to reside in a directory named "testdata", and writes its output to a
+   * directory named "output". Exceptions are printed to standard error and
+   * not rethrown.
+   * 
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   * @param measureClassName the String class name of the DistanceMeasure to use
+   * @param t1 the canopy T1 threshold
+   * @param t2 the canopy T2 threshold
+   */
+  private static void runJob(String input, String output,
+      String measureClassName, double t1, double t2) {
+    JobClient jobClient = new JobClient();
+    JobConf jobConf = new JobConf(Job.class);
+
+    Path outputDir = new Path(output);
+    jobClient.setConf(jobConf);
+
+    // intermediate vectors produced by the input conversion step
+    String dataDir = output + "/data";
+    try {
+      FileSystem fs = FileSystem.get(jobConf);
+      if (fs.exists(outputDir)) {
+        fs.delete(outputDir, true);
+      }
+      InputDriver.runJob(input, dataDir);
+      CanopyClusteringJob.runJob(dataDir, output, measureClassName, t1, t2);
+      OutputDriver.runJob(output + "/clusters", output + "/clustered-points");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,58 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+
+/**
+ * Map-only Hadoop job that runs {@link OutputMapper} over the clustering
+ * output and writes the mapper output to the given output directory.
+ */
+public class OutputDriver {
+
+  /**
+   * Command-line entry point.
+   *
+   * @param args args[0] is the input path, args[1] is the output path
+   */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configure and run the output formatting job. Any exception raised while
+   * running the job is printed to standard error and not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobConf conf = new JobConf(OutputDriver.class);
+
+    // OutputMapper is declared as Mapper<LongWritable, Text, Text, Text>, so
+    // both the key and the value classes of this job's output are Text.
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+
+    conf.setMapperClass(OutputMapper.class);
+
+    // zero reduce tasks: the job has no reduce phase
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0);
+
+    try {
+      // the static runJob submits the job itself; no JobClient instance needed
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.matrix.AbstractVector;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Mapper that splits each input line into a canopy and a point and emits the
+ * point keyed by the canopy's identifier.
+ */
+public class OutputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+
+  /**
+   * Decode the canopy from the text up to and including the first ']' and the
+   * point from the text beginning at index (position of that ']') + 3, then
+   * collect the canopy identifier as key and the point's format string as
+   * value.
+   *
+   * @param key the LongWritable input key (not used)
+   * @param values the Text holding an encoded canopy followed by a point
+   * @param output the OutputCollector receiving identifier/point pairs
+   * @param reporter the Reporter (not used)
+   * @throws IOException if the collector fails
+   */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    String line = values.toString();
+    int closeBracket = line.indexOf(']');
+
+    String encodedCanopy = line.substring(0, closeBracket + 1);
+    Canopy canopy = Canopy.decodeCanopy(encodedCanopy);
+
+    // the two characters after ']' are skipped; presumably a separator
+    // written by the clustering job -- confirm against its output format
+    String encodedPoint = line.substring(closeBracket + 3);
+    Vector point = AbstractVector.decodeVector(encodedPoint);
+
+    Text outKey = new Text(canopy.getIdentifier());
+    Text outValue = new Text(point.asFormatString());
+    output.collect(outKey, outValue);
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,86 @@
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+
+/**
+ * Example driver that chains the input conversion, canopy clustering, k-means
+ * clustering and output formatting jobs for the synthetic control data.
+ */
+public class Job {
+
+  /**
+   * Command-line entry point. With exactly seven arguments (input, output,
+   * distance measure class name, t1, t2, convergence delta, max iterations)
+   * the job runs with those values; with any other argument count it runs
+   * with the built-in defaults.
+   *
+   * @param args the command-line arguments
+   */
+  public static void main(String[] args) {
+    // seven values are read below (args[0] .. args[6])
+    if (args.length == 7) {
+      String input = args[0];
+      String output = args[1];
+      String measureClass = args[2];
+      double t1 = Double.parseDouble(args[3]);
+      double t2 = Double.parseDouble(args[4]);
+      double convergenceDelta = Double.parseDouble(args[5]);
+      int maxIterations = Integer.parseInt(args[6]);
+      // argument order must match runJob: t1, t2, then convergenceDelta
+      runJob(input, output, measureClass, t1, t2, convergenceDelta,
+          maxIterations);
+    } else {
+      runJob("testdata", "output",
+          "org.apache.mahout.utils.EuclideanDistanceMeasure", 80, 55, 0.5, 10);
+    }
+  }
+
+  /**
+   * Run the kmeans clustering job on an input dataset using the given distance
+   * measure, t1, t2 and iteration parameters. The output directory is deleted
+   * first if it already exists, and everything is then written beneath it; the
+   * final formatting step writes to {@code <output>/clustered-points}. By
+   * default, the job expects a file containing synthetic_control.data, as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series,
+   * to reside in a directory named "testdata", and writes its output to a
+   * directory named "output". Exceptions are printed to standard error and
+   * not rethrown.
+   * 
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   * @param measureClass the String class name of the DistanceMeasure to use
+   * @param t1 the canopy T1 threshold
+   * @param t2 the canopy T2 threshold
+   * @param convergenceDelta the double convergence criteria for iterations
+   * @param maxIterations the int maximum number of iterations
+   */
+  private static void runJob(String input, String output, String measureClass,
+      double t1, double t2, double convergenceDelta, int maxIterations) {
+    JobConf conf = new JobConf(Job.class);
+
+    Path outPath = new Path(output);
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+      InputDriver.runJob(input, output + "/data");
+      CanopyClusteringJob
+          .runJob(output + "/data", output, measureClass, t1, t2);
+      // <output>/canopies is presumably where the canopy step leaves the
+      // initial clusters for k-means -- confirm against CanopyClusteringJob
+      KMeansDriver.runJob(output + "/data", output + "/canopies", output,
+          measureClass, convergenceDelta, maxIterations);
+      OutputDriver.runJob(output + "/points", output + "/clustered-points");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,58 @@
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+
+/**
+ * Map-only Hadoop job that runs {@link OutputMapper} over the clustering
+ * output and writes the mapper output to the given output directory.
+ */
+public class OutputDriver {
+
+  /**
+   * Command-line entry point.
+   *
+   * @param args args[0] is the input path, args[1] is the output path
+   */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configure and run the output formatting job. Any exception raised while
+   * running the job is printed to standard error and not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobConf conf = new JobConf(OutputDriver.class);
+
+    // OutputMapper is declared as Mapper<LongWritable, Text, Text, Text>, so
+    // both the key and the value classes of this job's output are Text.
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+
+    conf.setMapperClass(OutputMapper.class);
+
+    // zero reduce tasks: the job has no reduce phase
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0);
+
+    try {
+      // the static runJob submits the job itself; no JobClient instance needed
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.kmeans.Cluster;
+import org.apache.mahout.matrix.AbstractVector;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Mapper that splits each input line into a cluster and a point and emits the
+ * point keyed by the cluster's identifier.
+ */
+public class OutputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+
+  /**
+   * Decode the cluster from the text up to and including the first ']' and the
+   * point from the text beginning at index (position of that ']') + 3, then
+   * collect the cluster identifier as key and the point's format string as
+   * value.
+   *
+   * @param key the LongWritable input key (not used)
+   * @param values the Text holding an encoded cluster followed by a point
+   * @param output the OutputCollector receiving identifier/point pairs
+   * @param reporter the Reporter (not used)
+   * @throws IOException if the collector fails
+   */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    String line = values.toString();
+    int closeBracket = line.indexOf(']');
+
+    String encodedCluster = line.substring(0, closeBracket + 1);
+    Cluster cluster = Cluster.decodeCluster(encodedCluster);
+
+    // the two characters after ']' are skipped; presumably a separator
+    // written by the clustering job -- confirm against its output format
+    String encodedPoint = line.substring(closeBracket + 3);
+    Vector point = AbstractVector.decodeVector(encodedPoint);
+
+    Text outKey = new Text(cluster.getIdentifier());
+    Text outValue = new Text(point.asFormatString());
+    output.collect(outKey, outValue);
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,59 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Map-only Hadoop job that runs the meanshift {@link InputMapper} over the
+ * input files and writes the mapper output to the given output directory.
+ */
+public class InputDriver {
+
+  /**
+   * Command-line entry point.
+   *
+   * @param args args[0] is the input path, args[1] is the output path
+   */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configure and run the input conversion job. Any exception raised while
+   * running the job is printed to standard error and not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    // InputDriver and InputMapper resolve to the classes of this package
+    JobConf conf = new JobConf(InputDriver.class);
+
+    // InputMapper is declared as Mapper<LongWritable, Text, Text, Text>, so
+    // both the key and the value classes of this job's output are Text.
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+
+    conf.setMapperClass(InputMapper.class);
+
+    // zero reduce tasks: the job has no reduce phase
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0);
+
+    try {
+      // the static runJob submits the job itself; no JobClient instance needed
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,54 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
+import org.apache.mahout.matrix.DenseVector;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Mapper that turns one line of space-separated numbers into a
+ * {@link MeanShiftCanopy} built around the parsed point and emits the
+ * canopy's string form.
+ */
+public class InputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+
+  /**
+   * Parse the numbers on the input line into a DenseVector, wrap it in a
+   * MeanShiftCanopy and collect the canopy's toString() as a Text value under
+   * a null key.
+   *
+   * @param key the LongWritable input key (not used)
+   * @param values the Text holding one line of space-separated numbers
+   * @param output the OutputCollector receiving the encoded canopy
+   * @param reporter the Reporter (not used)
+   * @throws IOException if the collector fails
+   */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    // runs of separator spaces yield empty tokens; those are skipped
+    List<Double> parsed = new ArrayList<Double>();
+    for (String token : values.toString().split(" ")) {
+      if (token.length() == 0) {
+        continue;
+      }
+      parsed.add(Double.valueOf(token));
+    }
+
+    Vector point = new DenseVector(parsed.size());
+    for (int i = 0; i < parsed.size(); i++) {
+      point.set(i, parsed.get(i));
+    }
+
+    MeanShiftCanopy canopy = new MeanShiftCanopy(point);
+    output.collect(null, new Text(canopy.toString()));
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,87 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver;
+
+/** Example driver that runs the meanshift clustering job end to end. */
+public class Job {
+
+  /**
+   * Runs the job from seven arguments (input, output, measure class name, t1, t2,
+   * convergence delta, max iterations); any other argument count uses defaults.
+   */
+  public static void main(String[] args) {
+    if (args.length == 7) {
+      runJob(args[0], args[1], args[2], Double.parseDouble(args[3]),
+          Double.parseDouble(args[4]), Double.parseDouble(args[5]),
+          Integer.parseInt(args[6]));
+    } else {
+      runJob("testdata", "output",
+          "org.apache.mahout.utils.EuclideanDistanceMeasure", 47.6, 1, 0.5, 10);
+    }
+  }
+
+  /**
+   * Run the meanshift clustering job on an input dataset using the given
+   * distance measure, t1, t2 and iteration parameters. All output data will be
+   * written to the output directory, which is deleted first if it exists. The
+   * clustered points will reside in {@code <output>/clustered-points}. By
+   * default, the job expects that a file containing synthetic_control.data, as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series,
+   * resides in a directory named "testdata", and writes output to "output".
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   * @param measureClassName the String class name of the DistanceMeasure to use
+   * @param t1 the meanshift canopy T1 threshold
+   * @param t2 the meanshift canopy T2 threshold
+   * @param convergenceDelta the double convergence criteria for iterations
+   * @param maxIterations the int maximum number of iterations
+   */
+  private static void runJob(String input, String output,
+      String measureClassName, double t1, double t2, double convergenceDelta,
+      int maxIterations) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(Job.class);
+    client.setConf(conf);
+    Path outPath = new Path(output);
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+      InputDriver.runJob(input, output + "/data");
+      MeanShiftCanopyJob.runJob(output + "/data", output + "/meanshift",
+          measureClassName, t1, t2, convergenceDelta, maxIterations);
+      // NOTE(review): uses the last listed entry, presumably the final iteration; relies on listing order
+      FileStatus[] status = dfs.listStatus(new Path(output + "/meanshift"));
+      OutputDriver.runJob(status[status.length - 1].getPath().toString(),
+          output + "/clustered-points");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,61 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+
+/** Configures and runs the job that applies OutputMapper to its input. */
+public class OutputDriver {
+
+  /** Runs the job with args[0] as the input path and args[1] as the output path. */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Runs OutputMapper with zero reduce tasks over the sequence file input at input,
+   * writing to output. Any exception from the job is printed, not rethrown.
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(OutputDriver.class);
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class); // OutputMapper emits Text values
+    conf.setInputFormat(SequenceFileInputFormat.class);
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+    conf.setMapperClass(OutputMapper.class);
+    conf.setReducerClass(Reducer.class); // presumably unused: zero reduce tasks requested
+    conf.setNumReduceTasks(0);
+
+    client.setConf(conf);
+    try {
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}

Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,50 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
+import org.apache.mahout.matrix.Vector;
+
+/** Emits every bound point of each decoded canopy under the key the canopy arrived with. */
+public class OutputMapper extends MapReduceBase implements
+    Mapper<Text, Text, Text, Text> {
+  int clusters = 0; // number of records passed to map so far
+
+  /** Decodes the canopy in values and collects each of its bound points under key. */
+  public void map(Text key, Text values, OutputCollector<Text, Text> output,
+      Reporter reporter) throws IOException {
+    clusters += 1;
+    for (Vector boundPoint : MeanShiftCanopy.decodeCanopy(values.toString()).getBoundPoints()) {
+      output.collect(key, new Text(boundPoint.asFormatString()));
+    }
+  }
+
+  /** Prints the number of records mapped to standard output, then closes the base class. */
+  @Override
+  public void close() throws IOException {
+    System.out.println("+++ Clusters=" + clusters);
+    super.close();
+  }
+}