You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2008/06/16 17:49:44 UTC
svn commit: r668205 - in
/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering:
./ syntheticcontrol/ syntheticcontrol/canopy/ syntheticcontrol/kmeans/
syntheticcontrol/meanshift/
Author: jeastman
Date: Mon Jun 16 08:49:43 2008
New Revision: 668205
URL: http://svn.apache.org/viewvc?rev=668205&view=rev
Log:
MAHOUT-59: improved cluster examples from the Jira patch with output formatting to a common format
Added:
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java
lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,57 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.mahout.matrix.Vector;
+
+/** Map-only Hadoop job that runs {@link InputMapper} over the raw input files. */
+public class InputDriver {
+  /** Expects the input path in args[0] and the output path in args[1]. */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+  /**
+   * Configure and run the conversion job; exceptions are printed, not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(InputDriver.class);
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class); // InputMapper emits Text, not Vector
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+    conf.setMapperClass(InputMapper.class);
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0); // no reduce phase is requested
+    client.setConf(conf);
+    try {
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,52 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.matrix.DenseVector;
+import org.apache.mahout.matrix.Vector;
+
+/** Parses each space-separated input line into a DenseVector and emits its text form. */
+public class InputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+  /** Emits one formatted vector per input line, with a null key. */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    List<Double> parsed = new ArrayList<Double>();
+    // runs of separator spaces produce empty tokens, which are skipped
+    for (String token : values.toString().split(" ")) {
+      if (token.length() == 0) {
+        continue;
+      }
+      parsed.add(Double.valueOf(token));
+    }
+    Vector vector = new DenseVector(parsed.size());
+    for (int i = 0; i < parsed.size(); i++) {
+      vector.set(i, parsed.get(i));
+    }
+    output.collect(null, new Text(vector.asFormatString()));
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,78 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
+
+/** Example driver that runs canopy clustering over the synthetic control data. */
+public class Job {
+  /**
+   * Command-line entry point. Exactly five arguments are read as input, output,
+   * measure class name, t1 and t2; any other count runs the built-in defaults.
+   */
+  public static void main(String[] args) {
+    if (args.length != 5) {
+      runJob("testdata", "output",
+          "org.apache.mahout.utils.EuclideanDistanceMeasure", 80, 55);
+      return;
+    }
+    double t1 = Double.parseDouble(args[3]);
+    double t2 = Double.parseDouble(args[4]);
+    runJob(args[0], args[1], args[2], t1, t2);
+  }
+
+  /**
+   * Run the canopy clustering job on an input dataset using the given distance
+   * measure and the t1 and t2 thresholds. All output data is written to the
+   * output directory, which is deleted first if it already exists; the clustered
+   * points are written to {@code <output>/clustered-points}. By default, the
+   * job expects a file containing synthetic_control.data, as obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+   * to reside in a directory named "testdata", and writes output to a directory
+   * named "output". Exceptions raised while running are printed, not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   * @param measureClassName the String class name of the DistanceMeasure to use
+   * @param t1 the canopy T1 threshold
+   * @param t2 the canopy T2 threshold
+   */
+  private static void runJob(String input, String output,
+      String measureClassName, double t1, double t2) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(Job.class);
+    Path outPath = new Path(output);
+    client.setConf(conf);
+    String dataDir = output + "/data";
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+      InputDriver.runJob(input, dataDir);
+      CanopyClusteringJob.runJob(dataDir, output, measureClassName, t1, t2);
+      OutputDriver.runJob(output + "/clusters", output + "/clustered-points");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,58 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+
+/** Map-only Hadoop job that runs {@link OutputMapper} over the given input path. */
+public class OutputDriver {
+  /** Expects the input path in args[0] and the output path in args[1]. */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+  /**
+   * Configure and run the formatting job; exceptions are printed, not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(OutputDriver.class);
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class); // OutputMapper emits Text, not IntWritable
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+    conf.setMapperClass(OutputMapper.class);
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0); // no reduce phase is requested
+    client.setConf(conf);
+    try {
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/canopy/OutputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.matrix.AbstractVector;
+import org.apache.mahout.matrix.Vector;
+
+/** Maps each input line to a (canopy identifier, formatted point) pair. */
+public class OutputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+  /** Decodes the canopy up to the first ']' and the point two characters after it. */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    String line = values.toString();
+    int canopyEnd = line.indexOf(']') + 1;
+    Canopy canopy = Canopy.decodeCanopy(line.substring(0, canopyEnd));
+    Vector point = AbstractVector.decodeVector(line.substring(canopyEnd + 2));
+    output.collect(new Text(canopy.getIdentifier()),
+        new Text(point.asFormatString()));
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,86 @@
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.canopy.CanopyClusteringJob;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+
+/** Example driver: canopy clustering, then k-means, on the synthetic control data. */
+public class Job {
+  /** Reads seven arguments (args[0]..args[6]); any other count runs the defaults. */
+  public static void main(String[] args) {
+    if (args.length == 7) {
+      String input = args[0];
+      String output = args[1];
+      String measureClass = args[2];
+      double t1 = Double.parseDouble(args[3]);
+      double t2 = Double.parseDouble(args[4]);
+      double convergenceDelta = Double.parseDouble(args[5]);
+      int maxIterations = Integer.parseInt(args[6]);
+      // argument order must match runJob: t1, t2, then convergenceDelta
+      runJob(input, output, measureClass, t1, t2, convergenceDelta,
+          maxIterations);
+    } else {
+      runJob("testdata", "output",
+          "org.apache.mahout.utils.EuclideanDistanceMeasure", 80, 55, 0.5, 10);
+    }
+  }
+  /**
+   * Run the kmeans clustering job on an input dataset using the given distance
+   * measure, t1, t2 and iteration parameters. All output data will be written
+   * to the output directory, which will be initially deleted if it exists. The
+   * clustered points will reside in {@code <output>/clustered-points}. By
+   * default, the job expects a file containing synthetic_control.data, as
+   * obtained from
+   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+   * to reside in "testdata", and writes output to "output".
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   * @param measureClass the String class name of the DistanceMeasure to use
+   * @param t1 the canopy T1 threshold
+   * @param t2 the canopy T2 threshold
+   * @param convergenceDelta the double convergence criteria for iterations
+   * @param maxIterations the int maximum number of iterations
+   */
+  private static void runJob(String input, String output, String measureClass,
+      double t1, double t2, double convergenceDelta, int maxIterations) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(Job.class);
+    Path outPath = new Path(output);
+    client.setConf(conf);
+    try {
+      FileSystem dfs = FileSystem.get(conf);
+      if (dfs.exists(outPath)) {
+        dfs.delete(outPath, true);
+      }
+      InputDriver.runJob(input, output + "/data");
+      CanopyClusteringJob.runJob(output + "/data", output, measureClass, t1, t2);
+      KMeansDriver.runJob(output + "/data", output + "/canopies", output,
+          measureClass, convergenceDelta, maxIterations);
+      OutputDriver.runJob(output + "/points", output + "/clustered-points");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,58 @@
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+
+/** Map-only Hadoop job that runs {@link OutputMapper} over the given input path. */
+public class OutputDriver {
+  /** Expects the input path in args[0] and the output path in args[1]. */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+  /**
+   * Configure and run the formatting job; exceptions are printed, not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(OutputDriver.class);
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class); // OutputMapper emits Text, not IntWritable
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+    conf.setMapperClass(OutputMapper.class);
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0); // no reduce phase is requested
+    client.setConf(conf);
+    try {
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/kmeans/OutputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,45 @@
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.kmeans.Cluster;
+import org.apache.mahout.matrix.AbstractVector;
+import org.apache.mahout.matrix.Vector;
+
+/** Maps each input line to a (cluster identifier, formatted point) pair. */
+public class OutputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+  /** Decodes the cluster up to the first ']' and the point two characters after it. */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    String line = values.toString();
+    int clusterEnd = line.indexOf(']') + 1;
+    Cluster cluster = Cluster.decodeCluster(line.substring(0, clusterEnd));
+    Vector point = AbstractVector.decodeVector(line.substring(clusterEnd + 2));
+    output.collect(new Text(cluster.getIdentifier()),
+        new Text(point.asFormatString()));
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,59 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.mahout.matrix.Vector;
+
+/** Map-only Hadoop job that runs the meanshift {@link InputMapper} over the raw input files. */
+public class InputDriver {
+
+  /** Expects the input path in args[0] and the output path in args[1]. */
+  public static void main(String[] args) {
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configure and run the conversion job; exceptions are printed, not rethrown.
+   *
+   * @param input the String denoting the input directory path
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(InputDriver.class);
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class); // InputMapper emits Text, not Vector
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+    conf.setMapperClass(InputMapper.class);
+    conf.setReducerClass(Reducer.class);
+    conf.setNumReduceTasks(0); // no reduce phase is requested
+    client.setConf(conf);
+    try {
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,54 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
+import org.apache.mahout.matrix.DenseVector;
+import org.apache.mahout.matrix.Vector;
+
+/** Parses each space-separated input line into a point and emits a MeanShiftCanopy built from it. */
+public class InputMapper extends MapReduceBase implements
+    Mapper<LongWritable, Text, Text, Text> {
+  /** Emits the string form of one MeanShiftCanopy per input line, with a null key. */
+  public void map(LongWritable key, Text values,
+      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
+    List<Double> parsed = new ArrayList<Double>();
+    // runs of separator spaces produce empty tokens, which are skipped
+    for (String token : values.toString().split(" ")) {
+      if (token.length() == 0) {
+        continue;
+      }
+      parsed.add(Double.valueOf(token));
+    }
+    Vector point = new DenseVector(parsed.size());
+    for (int i = 0; i < parsed.size(); i++) {
+      point.set(i, parsed.get(i));
+    }
+    MeanShiftCanopy canopy = new MeanShiftCanopy(point);
+    output.collect(null, new Text(canopy.toString()));
+  }
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,87 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopyJob;
+import org.apache.mahout.clustering.syntheticcontrol.meanshift.InputDriver;
+
+public class Job {
+
+ public static void main(String[] args) {
+ if (args.length == 7) {
+ String input = args[0];
+ String output = args[1];
+ String measureClassName = args[2];
+ double t1 = new Double(args[3]);
+ double t2 = new Double(args[4]);
+ double convergenceDelta = new Double(args[5]);
+ int maxIterations = new Integer(args[6]);
+ runJob(input, output, measureClassName, t1, t2, convergenceDelta,
+ maxIterations);
+ } else
+ runJob("testdata", "output",
+ "org.apache.mahout.utils.EuclideanDistanceMeasure", 47.6, 1, 0.5, 10);
+ }
+
+ /**
+ * Run the meanshift clustering job on an input dataset using the given
+ * distance measure, t1, t2 and iteration parameters. All output data will be
+ * written to the output directory, which will be initially deleted if it
+ * exists. The clustered points will reside in the path
+ * <output>/clustered-points. By default, the job expects the a file
+ * containing synthetic_control.data as obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+ * resides in a directory named "testdata", and writes output to a directory
+ * named "output".
+ *
+ * @param input the String denoting the input directory path
+ * @param output the String denoting the output directory path
+ * @param measureClassName the String class name of the DistanceMeasure to use
+ * @param t1 the meanshift canopy T1 threshold
+ * @param t2 the meanshift canopy T2 threshold
+ * @param convergenceDelta the double convergence criteria for iterations
+ * @param maxIterations the int maximum number of iterations
+ */
+ private static void runJob(String input, String output,
+ String measureClassName, double t1, double t2, double convergenceDelta,
+ int maxIterations) {
+ JobClient client = new JobClient();
+ JobConf conf = new JobConf(Job.class);
+
+ Path outPath = new Path(output);
+ client.setConf(conf);
+ try {
+ FileSystem dfs = FileSystem.get(conf);
+ if (dfs.exists(outPath))
+ dfs.delete(outPath, true);
+ InputDriver.runJob(input, output + "/data");
+ MeanShiftCanopyJob.runJob(output + "/data", output + "/meanshift",
+ measureClassName, t1, t2, convergenceDelta, maxIterations);
+ FileStatus[] status = dfs.listStatus(new Path(output + "/meanshift"));
+ OutputDriver.runJob(status[status.length - 1].getPath().toString(),
+ output + "/clustered-points");
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputDriver.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,61 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+
+/**
+ * Driver for the map-only job that runs {@link OutputMapper} over a sequence
+ * file input and writes the mapper's Text key/value pairs to the output path.
+ */
+public class OutputDriver {
+
+  /**
+   * Command-line entry point.
+   *
+   * @param args the input path followed by the output path
+   * @throws IllegalArgumentException if fewer than two arguments are given
+   */
+  public static void main(String[] args) {
+    if (args.length < 2) {
+      throw new IllegalArgumentException(
+          "Usage: OutputDriver <input> <output>");
+    }
+    runJob(args[0], args[1]);
+  }
+
+  /**
+   * Configure and run the output formatting job. Any exception raised while
+   * running the job is printed to standard error and not propagated.
+   *
+   * @param input the String denoting the input path (read as sequence files)
+   * @param output the String denoting the output directory path
+   */
+  public static void runJob(String input, String output) {
+    JobClient client = new JobClient();
+    JobConf conf = new JobConf(OutputDriver.class);
+
+    // OutputMapper is a Mapper<Text, Text, Text, Text>, so both the output key
+    // and the output value are Text.
+    conf.setOutputKeyClass(Text.class);
+    conf.setOutputValueClass(Text.class);
+    conf.setInputFormat(SequenceFileInputFormat.class);
+
+    FileInputFormat.setInputPaths(conf, new Path(input));
+    FileOutputFormat.setOutputPath(conf, new Path(output));
+
+    conf.setMapperClass(OutputMapper.class);
+
+    // Map-only job: with zero reduce tasks no reducer class is needed.
+    conf.setNumReduceTasks(0);
+
+    client.setConf(conf);
+    try {
+      JobClient.runJob(conf);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}
Added: lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java?rev=668205&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/examples/org/apache/mahout/clustering/syntheticcontrol/meanshift/OutputMapper.java Mon Jun 16 08:49:43 2008
@@ -0,0 +1,50 @@
+package org.apache.mahout.clustering.syntheticcontrol.meanshift;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
+import org.apache.mahout.matrix.Vector;
+
+/**
+ * Mapper that decodes each incoming value as a {@link MeanShiftCanopy} and
+ * emits every bound point of that canopy, formatted as text, under the
+ * incoming key.
+ */
+public class OutputMapper extends MapReduceBase implements
+    Mapper<Text, Text, Text, Text> {
+
+  /** Number of records this mapper instance has processed so far. */
+  int clusters = 0;
+
+  /**
+   * Decode one canopy and emit one output record per bound point.
+   *
+   * @param key the Text key of the record, reused for every emitted point
+   * @param values the Text holding the encoded canopy
+   * @param output the collector receiving the formatted points
+   * @param reporter the Reporter (not used)
+   * @throws IOException if the collector fails
+   */
+  public void map(Text key, Text values, OutputCollector<Text, Text> output,
+      Reporter reporter) throws IOException {
+    clusters += 1;
+    MeanShiftCanopy decoded = MeanShiftCanopy.decodeCanopy(values.toString());
+    for (Vector boundPoint : decoded.getBoundPoints()) {
+      Text formatted = new Text(boundPoint.asFormatString());
+      output.collect(key, formatted);
+    }
+  }
+
+  /**
+   * Print the number of processed records to standard output, then delegate
+   * to the superclass.
+   */
+  @Override
+  public void close() throws IOException {
+    System.out.println("+++ Clusters=" + clusters);
+    super.close();
+  }
+
+}