You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2012/06/20 14:07:58 UTC
svn commit: r1352052 [2/7] - in /mahout/trunk: ./ buildtools/
buildtools/src/main/resources/ core/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/
core/src/main/java/org/apache/mahout/cf/t...
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java Wed Jun 20 12:07:50 2012
@@ -44,88 +44,102 @@ public final class TreeVisualizer {
return df.format(value);
}
- private static String toStringNode(Node node, Dataset dataset, String[] attrNames,
- Map<String, Field> fields, int layer) throws IllegalAccessException {
+ private static String toStringNode(Node node,
+ Dataset dataset,
+ String[] attrNames,
+ Map<String, Field> fields,
+ int layer) {
+
StringBuilder buff = new StringBuilder();
- if (node instanceof CategoricalNode) {
- CategoricalNode cnode = (CategoricalNode) node;
- int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
- double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
- Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
- String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
- for (int i = 0; i < childs.length; i++) {
+ try {
+
+ if (node instanceof CategoricalNode) {
+ CategoricalNode cnode = (CategoricalNode) node;
+ int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
+ double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
+ Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
+ String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
+ for (int i = 0; i < childs.length; i++) {
+ buff.append('\n');
+ for (int j = 0; j < layer; j++) {
+ buff.append("| ");
+ }
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ").append(attrValues[attr][i]);
+ int index = ArrayUtils.indexOf(values, i);
+ if (index >= 0) {
+ buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1));
+ }
+ }
+ } else if (node instanceof NumericalNode) {
+ NumericalNode nnode = (NumericalNode) node;
+ int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
+ double split = (Double) fields.get("NumericalNode.split").get(nnode);
+ Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
+ Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
buff.append('\n');
for (int j = 0; j < layer; j++) {
buff.append("| ");
}
- buff.append((attrNames == null ? attr : attrNames[attr]) + " = " + attrValues[attr][i]);
- int index = ArrayUtils.indexOf(values, i);
- if (index >= 0) {
- buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1));
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" < ").append(doubleToString(split));
+ buff.append(toStringNode(loChild, dataset, attrNames, fields, layer + 1));
+ buff.append('\n');
+ for (int j = 0; j < layer; j++) {
+ buff.append("| ");
+ }
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" >= ").append(doubleToString(split));
+ buff.append(toStringNode(hiChild, dataset, attrNames, fields, layer + 1));
+ } else if (node instanceof Leaf) {
+ Leaf leaf = (Leaf) node;
+ double label = (Double) fields.get("Leaf.label").get(leaf);
+ if (dataset.isNumerical(dataset.getLabelId())) {
+ buff.append(" : ").append(doubleToString(label));
+ } else {
+ buff.append(" : ").append(dataset.getLabelString((int) label));
}
}
- } else if (node instanceof NumericalNode) {
- NumericalNode nnode = (NumericalNode) node;
- int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
- double split = (Double) fields.get("NumericalNode.split").get(nnode);
- Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
- Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
- buff.append('\n');
- for (int j = 0; j < layer; j++) {
- buff.append("| ");
- }
- buff.append((attrNames == null ? attr : attrNames[attr]) + " < " + doubleToString(split));
- buff.append(toStringNode(loChild, dataset, attrNames, fields, layer + 1));
- buff.append('\n');
- for (int j = 0; j < layer; j++) {
- buff.append("| ");
- }
- buff.append((attrNames == null ? attr : attrNames[attr]) + " >= " + doubleToString(split));
- buff.append(toStringNode(hiChild, dataset, attrNames, fields, layer + 1));
- } else if (node instanceof Leaf) {
- Leaf leaf = (Leaf) node;
- double label = (Double) fields.get("Leaf.label").get(leaf);
- if (dataset.isNumerical(dataset.getLabelId())) {
- buff.append(" : ").append(doubleToString(label));
- } else {
- buff.append(" : ").append(dataset.getLabelString((int) label));
- }
+
+ } catch (IllegalAccessException iae) {
+ throw new IllegalStateException(iae);
}
return buff.toString();
}
- private static Map<String, Field> getReflectMap() throws Exception {
+ private static Map<String, Field> getReflectMap() {
Map<String, Field> fields = new HashMap<String, Field>();
- Field m = CategoricalNode.class.getDeclaredField("attr");
- m.setAccessible(true);
- fields.put("CategoricalNode.attr", m);
- m = CategoricalNode.class.getDeclaredField("values");
- m.setAccessible(true);
- fields.put("CategoricalNode.values", m);
- m = CategoricalNode.class.getDeclaredField("childs");
- m.setAccessible(true);
- fields.put("CategoricalNode.childs", m);
- m = NumericalNode.class.getDeclaredField("attr");
- m.setAccessible(true);
- fields.put("NumericalNode.attr", m);
- m = NumericalNode.class.getDeclaredField("split");
- m.setAccessible(true);
- fields.put("NumericalNode.split", m);
- m = NumericalNode.class.getDeclaredField("loChild");
- m.setAccessible(true);
- fields.put("NumericalNode.loChild", m);
- m = NumericalNode.class.getDeclaredField("hiChild");
- m.setAccessible(true);
- fields.put("NumericalNode.hiChild", m);
- m = Leaf.class.getDeclaredField("label");
- m.setAccessible(true);
- fields.put("Leaf.label", m);
- m = Dataset.class.getDeclaredField("values");
- m.setAccessible(true);
- fields.put("Dataset.values", m);
+ try {
+ Field m = CategoricalNode.class.getDeclaredField("attr");
+ m.setAccessible(true);
+ fields.put("CategoricalNode.attr", m);
+ m = CategoricalNode.class.getDeclaredField("values");
+ m.setAccessible(true);
+ fields.put("CategoricalNode.values", m);
+ m = CategoricalNode.class.getDeclaredField("childs");
+ m.setAccessible(true);
+ fields.put("CategoricalNode.childs", m);
+ m = NumericalNode.class.getDeclaredField("attr");
+ m.setAccessible(true);
+ fields.put("NumericalNode.attr", m);
+ m = NumericalNode.class.getDeclaredField("split");
+ m.setAccessible(true);
+ fields.put("NumericalNode.split", m);
+ m = NumericalNode.class.getDeclaredField("loChild");
+ m.setAccessible(true);
+ fields.put("NumericalNode.loChild", m);
+ m = NumericalNode.class.getDeclaredField("hiChild");
+ m.setAccessible(true);
+ fields.put("NumericalNode.hiChild", m);
+ m = Leaf.class.getDeclaredField("label");
+ m.setAccessible(true);
+ fields.put("Leaf.label", m);
+ m = Dataset.class.getDeclaredField("values");
+ m.setAccessible(true);
+ fields.put("Dataset.values", m);
+ } catch (NoSuchFieldException nsfe) {
+ throw new IllegalStateException(nsfe);
+ }
return fields;
}
@@ -134,71 +148,73 @@ public final class TreeVisualizer {
* Decision tree to String
* @param tree
* Node of tree
- * @param dataset
* @param attrNames
* attribute names
*/
- public static String toString(Node tree, Dataset dataset, String[] attrNames)
- throws Exception {
+ public static String toString(Node tree, Dataset dataset, String[] attrNames) {
return toStringNode(tree, dataset, attrNames, getReflectMap(), 0);
}
/**
* Print Decision tree
- * @param tree
- * Node of tree
- * @param dataset
- * @param attrNames
- * attribute names
+ * @param tree Node of tree
+ * @param attrNames attribute names
*/
- public static void print(Node tree, Dataset dataset, String[] attrNames) throws Exception {
+ public static void print(Node tree, Dataset dataset, String[] attrNames) {
System.out.println(toString(tree, dataset, attrNames));
}
- private static String toStringPredict(Node node, Instance instance, Dataset dataset,
- String[] attrNames, Map<String, Field> fields) throws IllegalAccessException {
+ private static String toStringPredict(Node node,
+ Instance instance,
+ Dataset dataset,
+ String[] attrNames,
+ Map<String, Field> fields) {
StringBuilder buff = new StringBuilder();
- if (node instanceof CategoricalNode) {
- CategoricalNode cnode = (CategoricalNode) node;
- int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
- double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
- Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
- String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
-
- int index = ArrayUtils.indexOf(values, instance.get(attr));
- if (index >= 0) {
- buff.append((attrNames == null ? attr : attrNames[attr]) + " = "
- + attrValues[attr][(int) instance.get(attr)]);
- buff.append(" -> ");
- buff.append(toStringPredict(childs[index], instance, dataset, attrNames, fields));
- }
- } else if (node instanceof NumericalNode) {
- NumericalNode nnode = (NumericalNode) node;
- int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
- double split = (Double) fields.get("NumericalNode.split").get(nnode);
- Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
- Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
-
- if (instance.get(attr) < split) {
- buff.append("(" + (attrNames == null ? attr : attrNames[attr]) + " = "
- + doubleToString(instance.get(attr)) + ") < " + doubleToString(split));
- buff.append(" -> ");
- buff.append(toStringPredict(loChild, instance, dataset, attrNames, fields));
- } else {
- buff.append("(" + (attrNames == null ? attr : attrNames[attr]) + " = "
- + doubleToString(instance.get(attr)) + ") >= " + doubleToString(split));
- buff.append(" -> ");
- buff.append(toStringPredict(hiChild, instance, dataset, attrNames, fields));
- }
- } else if (node instanceof Leaf) {
- Leaf leaf = (Leaf) node;
- double label = (Double) fields.get("Leaf.label").get(leaf);
- if (dataset.isNumerical(dataset.getLabelId())) {
- buff.append(doubleToString(label));
- } else {
- buff.append(dataset.getLabelString((int) label));
+ try {
+ if (node instanceof CategoricalNode) {
+ CategoricalNode cnode = (CategoricalNode) node;
+ int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
+ double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
+ Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
+ String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
+
+ int index = ArrayUtils.indexOf(values, instance.get(attr));
+ if (index >= 0) {
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+ .append(attrValues[attr][(int) instance.get(attr)]);
+ buff.append(" -> ");
+ buff.append(toStringPredict(childs[index], instance, dataset, attrNames, fields));
+ }
+ } else if (node instanceof NumericalNode) {
+ NumericalNode nnode = (NumericalNode) node;
+ int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
+ double split = (Double) fields.get("NumericalNode.split").get(nnode);
+ Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
+ Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
+
+ if (instance.get(attr) < split) {
+ buff.append('(').append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+ .append(doubleToString(instance.get(attr))).append(") < ").append(doubleToString(split));
+ buff.append(" -> ");
+ buff.append(toStringPredict(loChild, instance, dataset, attrNames, fields));
+ } else {
+ buff.append('(').append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+ .append(doubleToString(instance.get(attr))).append(") >= ").append(doubleToString(split));
+ buff.append(" -> ");
+ buff.append(toStringPredict(hiChild, instance, dataset, attrNames, fields));
+ }
+ } else if (node instanceof Leaf) {
+ Leaf leaf = (Leaf) node;
+ double label = (Double) fields.get("Leaf.label").get(leaf);
+ if (dataset.isNumerical(dataset.getLabelId())) {
+ buff.append(doubleToString(label));
+ } else {
+ buff.append(dataset.getLabelString((int) label));
+ }
}
+ } catch (IllegalAccessException iae) {
+ throw new IllegalStateException(iae);
}
return buff.toString();
@@ -208,12 +224,10 @@ public final class TreeVisualizer {
* Predict trace to String
* @param tree
* Node of tree
- * @param data
* @param attrNames
* attribute names
*/
- public static String[] predictTrace(Node tree, Data data, String[] attrNames)
- throws Exception {
+ public static String[] predictTrace(Node tree, Data data, String[] attrNames) {
Map<String, Field> reflectMap = getReflectMap();
String[] prediction = new String[data.size()];
for (int i = 0; i < data.size(); i++) {
@@ -226,16 +240,13 @@ public final class TreeVisualizer {
* Print predict trace
* @param tree
* Node of tree
- * @param data
* @param attrNames
* attribute names
*/
- public static void predictTracePrint(Node tree, Data data, String[] attrNames)
- throws Exception {
+ public static void predictTracePrint(Node tree, Data data, String[] attrNames) {
Map<String, Field> reflectMap = getReflectMap();
for (int i = 0; i < data.size(); i++) {
- System.out.println(toStringPredict(tree, data.get(i), data.getDataset(), attrNames,
- reflectMap));
+ System.out.println(toStringPredict(tree, data.get(i), data.getDataset(), attrNames, reflectMap));
}
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java Wed Jun 20 12:07:50 2012
@@ -159,7 +159,7 @@ public final class UDistrib {
Path dataPath = new Path(dataStr);
FileSystem ifs = dataPath.getFileSystem(conf);
FSDataInputStream input = ifs.open(dataPath);
- Scanner scanner = new Scanner(input);
+ Scanner scanner = new Scanner(input, "UTF-8");
DataConverter converter = new DataConverter(dataset);
int nbInstances = dataset.nbInstances();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java Wed Jun 20 12:07:50 2012
@@ -37,7 +37,7 @@ import java.util.Random;
public class Auc {
private int maxBufferSize = 10000;
- private final DoubleArrayList[] scores = { new DoubleArrayList(), new DoubleArrayList() };
+ private final DoubleArrayList[] scores = {new DoubleArrayList(), new DoubleArrayList()};
private final Random rand;
private int samples;
private final double threshold;
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java Wed Jun 20 12:07:50 2012
@@ -46,9 +46,12 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
+import java.util.regex.Pattern;
public final class BayesUtils {
+ private static final Pattern SLASH = Pattern.compile("/");
+
private BayesUtils() {}
public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) {
@@ -116,7 +119,7 @@ public final class BayesUtils {
int i = 0;
try {
for (Object label : labels) {
- String theLabel = ((Pair<?,?>) label).getFirst().toString().split("/")[1];
+ String theLabel = SLASH.split(((Pair<?, ?>) label).getFirst().toString())[1];
if (!seen.contains(theLabel)) {
writer.append(new Text(theLabel), new IntWritable(i++));
seen.add(theLabel);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java Wed Jun 20 12:07:50 2012
@@ -30,6 +30,7 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import java.io.IOException;
+import java.util.regex.Pattern;
/**
* Run the input through the model and see if it matches.
@@ -38,12 +39,13 @@ import java.io.IOException;
*/
public class BayesTestMapper extends Mapper<Text, VectorWritable, Text, VectorWritable> {
+ private static final Pattern SLASH = Pattern.compile("/");
+
private AbstractNaiveBayesClassifier classifier;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
- System.out.println("Setup");
Configuration conf = context.getConfiguration();
Path modelPath = HadoopUtil.cachedFile(conf);
NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, conf);
@@ -59,6 +61,6 @@ public class BayesTestMapper extends Map
protected void map(Text key, VectorWritable value, Context context) throws IOException, InterruptedException {
Vector result = classifier.classifyFull(value.get());
//the key is the expected value
- context.write(new Text(key.toString().split("/")[1]), new VectorWritable(result));
+ context.write(new Text(SLASH.split(key.toString())[1]), new VectorWritable(result));
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java Wed Jun 20 12:07:50 2012
@@ -20,6 +20,7 @@ package org.apache.mahout.classifier.nai
import java.io.IOException;
import java.util.List;
import java.util.Map;
+import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -60,6 +61,7 @@ public class TestNaiveBayesDriver extend
public static final String LABEL_KEY = "labels";
public static final String COMPLEMENTARY = "class"; //b for bayes, c for complementary
+ private static final Pattern SLASH = Pattern.compile("/");
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new TestNaiveBayesDriver(), args);
@@ -95,11 +97,11 @@ public class TestNaiveBayesDriver extend
}
SequenceFile.Writer writer =
new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class, VectorWritable.class);
- SequenceFile.Reader reader = new Reader(fs, getInputPath(), getConf());
+ Reader reader = new Reader(fs, getInputPath(), getConf());
Text key = new Text();
VectorWritable vw = new VectorWritable();
while (reader.next(key, vw)) {
- writer.append(new Text(key.toString().split("/")[1]),
+ writer.append(new Text(SLASH.split(key.toString())[1]),
new VectorWritable(classifier.classifyFull(vw.get())));
}
writer.close();
@@ -137,8 +139,7 @@ public class TestNaiveBayesDriver extend
//testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
boolean complementary = parsedArgs.containsKey("testComplementary");
testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
- boolean succeeded = testJob.waitForCompletion(true);
- return succeeded;
+ return testJob.waitForCompletion(true);
}
private static void analyzeResults(Map<Integer, String> labelMap,
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java Wed Jun 20 12:07:50 2012
@@ -18,6 +18,7 @@
package org.apache.mahout.classifier.naivebayes.training;
import java.io.IOException;
+import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
@@ -28,6 +29,8 @@ import org.apache.mahout.math.map.OpenOb
public class IndexInstancesMapper extends Mapper<Text, VectorWritable, IntWritable, VectorWritable> {
+ private static final Pattern SLASH = Pattern.compile("/");
+
public enum Counter { SKIPPED_INSTANCES }
private OpenObjectIntHashMap<String> labelIndex;
@@ -40,7 +43,7 @@ public class IndexInstancesMapper extend
@Override
protected void map(Text labelText, VectorWritable instance, Context ctx) throws IOException, InterruptedException {
- String label = labelText.toString().split("/")[1];
+ String label = SLASH.split(labelText.toString())[1];
if (labelIndex.containsKey(label)) {
ctx.write(new IntWritable(labelIndex.get(label)), instance);
} else {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java Wed Jun 20 12:07:50 2012
@@ -30,6 +30,7 @@ import org.apache.mahout.classifier.naiv
import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
@@ -98,18 +99,32 @@ public final class TrainNaiveBayesJob ex
HadoopUtil.cacheFiles(labPath, getConf());
//add up all the vectors with the same labels, while mapping the labels into our index
- Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS), SequenceFileInputFormat.class,
- IndexInstancesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class, IntWritable.class,
- VectorWritable.class, SequenceFileOutputFormat.class);
+ Job indexInstances = prepareJob(getInputPath(),
+ getTempPath(SUMMED_OBSERVATIONS),
+ SequenceFileInputFormat.class,
+ IndexInstancesMapper.class,
+ IntWritable.class,
+ VectorWritable.class,
+ VectorSumReducer.class,
+ IntWritable.class,
+ VectorWritable.class,
+ SequenceFileOutputFormat.class);
indexInstances.setCombinerClass(VectorSumReducer.class);
boolean succeeded = indexInstances.waitForCompletion(true);
if (!succeeded) {
return -1;
}
//sum up all the weights from the previous step, per label and per feature
- Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
- SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class,
- Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+ Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
+ getTempPath(WEIGHTS),
+ SequenceFileInputFormat.class,
+ WeightsMapper.class,
+ Text.class,
+ VectorWritable.class,
+ VectorSumReducer.class,
+ Text.class,
+ VectorWritable.class,
+ SequenceFileOutputFormat.class);
weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
weightSummer.setCombinerClass(VectorSumReducer.class);
succeeded = weightSummer.waitForCompletion(true);
@@ -120,10 +135,18 @@ public final class TrainNaiveBayesJob ex
//put the per label and per feature vectors into the cache
HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
- //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors -- TODO: add reference here to the part of the Rennie paper that discusses this
- Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
- SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class,
- Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+ //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
+ // TODO: add reference here to the part of the Rennie paper that discusses this
+ Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
+ getTempPath(THETAS),
+ SequenceFileInputFormat.class,
+ ThetaMapper.class,
+ Text.class,
+ VectorWritable.class,
+ VectorSumReducer.class,
+ Text.class,
+ VectorWritable.class,
+ SequenceFileOutputFormat.class);
thetaSummer.setCombinerClass(VectorSumReducer.class);
thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
@@ -147,8 +170,11 @@ public final class TrainNaiveBayesJob ex
Iterable<String> labels = Splitter.on(",").split(getOption(LABELS));
labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath);
} else if (hasOption(EXTRACT_LABELS)) {
- SequenceFileDirIterable<Text, IntWritable> iterable =
- new SequenceFileDirIterable<Text, IntWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), getConf());
+ Iterable<Pair<Text,IntWritable>> iterable =
+ new SequenceFileDirIterable<Text, IntWritable>(getInputPath(),
+ PathType.LIST,
+ PathFilters.logsCRCFilter(),
+ getConf());
labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
}
return labelSize;
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java Wed Jun 20 12:07:50 2012
@@ -98,7 +98,7 @@ public final class BaumWelchTrainer {
List<Integer> observations = new ArrayList<Integer>();
//reading observations
- Scanner scanner = new Scanner(new FileInputStream(input));
+ Scanner scanner = new Scanner(new FileInputStream(input), "UTF-8");
try {
while (scanner.hasNextInt()) {
observations.add(scanner.nextInt());
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java Wed Jun 20 12:07:50 2012
@@ -18,6 +18,7 @@
package org.apache.mahout.classifier.sequencelearning.hmm;
+import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
@@ -33,6 +34,7 @@ import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStreamWriter;
import java.io.PrintWriter;
/**
@@ -90,7 +92,7 @@ public final class RandomSequenceGenerat
int[] observations = HmmEvaluator.predict(model, length, System.currentTimeMillis());
//writing output
- PrintWriter writer = new PrintWriter(new FileOutputStream(output), true);
+ PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(output), Charsets.UTF_8), true);
try {
for (int observation : observations) {
writer.print(observation);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java Wed Jun 20 12:07:50 2012
@@ -17,6 +17,7 @@
package org.apache.mahout.classifier.sequencelearning.hmm;
+import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
@@ -33,6 +34,7 @@ import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
@@ -90,7 +92,7 @@ public final class ViterbiEvaluator {
//reading observations
List<Integer> observations = new ArrayList<Integer>();
- Scanner scanner = new Scanner(new FileInputStream(input));
+ Scanner scanner = new Scanner(new FileInputStream(input), "UTF-8");
try {
while (scanner.hasNextInt()) {
observations.add(scanner.nextInt());
@@ -108,7 +110,7 @@ public final class ViterbiEvaluator {
int[] hiddenStates = HmmEvaluator.decode(model, observationsArray, true);
//writing output
- PrintWriter writer = new PrintWriter(new FileOutputStream(output), true);
+ PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(output), Charsets.UTF_8), true);
try {
for (int hiddenState : hiddenStates) {
writer.print(hiddenState);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java Wed Jun 20 12:07:50 2012
@@ -82,7 +82,7 @@ public abstract class AbstractOnlineLogi
* of v is disturbed.
* @return A version of v with the link function applied.
*/
- public Vector link(Vector v) {
+ public static Vector link(Vector v) {
double max = v.maxValue();
if (max >= 40) {
// if max >= 40, we subtract the large offset first
@@ -101,7 +101,7 @@ public abstract class AbstractOnlineLogi
* @param r The value to transform.
* @return The logit of r.
*/
- public double link(double r) {
+ public static double link(double r) {
if (r < 0.0) {
double s = Math.exp(r);
return s / (1.0 + s);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java Wed Jun 20 12:07:50 2012
@@ -116,7 +116,7 @@ public class AdaptiveLogisticRegression
Wrapper w = new Wrapper(numCategories, numFeatures, prior);
seed.setPayload(w);
- w.setMappings(seed);
+ Wrapper.setMappings(seed);
seed.setPayload(w);
setPoolSize(this.poolSize);
}
@@ -180,7 +180,7 @@ public class AdaptiveLogisticRegression
// mutation rates small and also hack their learning rate to be small
// as well.
for (State<Wrapper, CrossFoldLearner> state : ep.getPopulation().subList(0, SURVIVORS)) {
- state.getPayload().freeze(state);
+ Wrapper.freeze(state);
}
}
}
@@ -411,7 +411,7 @@ public class AdaptiveLogisticRegression
wrapped.decayExponent(0);
}
- public void freeze(State<Wrapper, CrossFoldLearner> s) {
+ public static void freeze(State<Wrapper, CrossFoldLearner> s) {
// radically decrease learning rate
s.getParams()[1] -= 10;
@@ -423,7 +423,7 @@ public class AdaptiveLogisticRegression
}
}
- public void setMappings(State<Wrapper, CrossFoldLearner> x) {
+ public static void setMappings(State<Wrapper, CrossFoldLearner> x) {
int i = 0;
// set the range for regularization (lambda)
x.setMap(i++, Mapping.logLimit(1.0e-8, 0.1));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/AbstractCluster.java Wed Jun 20 12:07:50 2012
@@ -130,6 +130,7 @@ public abstract class AbstractCluster im
this.id = id;
}
+ @Override
public long getNumObservations() {
return numObservations;
}
@@ -142,6 +143,7 @@ public abstract class AbstractCluster im
this.numObservations = l;
}
+ @Override
public long getTotalObservations() {
return totalObservations;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java Wed Jun 20 12:07:50 2012
@@ -17,8 +17,6 @@
package org.apache.mahout.clustering.canopy;
-import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY;
-
import java.io.IOException;
import java.util.Collection;
@@ -39,6 +37,7 @@ import org.apache.mahout.clustering.clas
import org.apache.mahout.clustering.classify.ClusterClassifier;
import org.apache.mahout.clustering.iterator.CanopyClusteringPolicy;
import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.topdown.PathDirectory;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
@@ -155,13 +154,12 @@ public class CanopyDriver extends Abstra
Path clustersOut = buildClusters(conf, input, output, measure, t1, t2, t3,
t4, clusterFilter, runSequential);
if (runClustering) {
- clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
+ clusterData(input, clustersOut, output, clusterClassificationThreshold, runSequential);
}
}
/**
* Convenience method to provide backward compatibility
- * @param clusterClassificationThreshold TODO
*/
public static void run(Configuration conf, Path input, Path output,
DistanceMeasure measure, double t1, double t2, boolean runClustering,
@@ -365,12 +363,17 @@ public class CanopyDriver extends Abstra
return canopyOutputDir;
}
- private static void clusterData(Configuration conf, Path points, Path canopies, Path output,
- double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
- ClassNotFoundException {
+ private static void clusterData(Path points,
+ Path canopies,
+ Path output,
+ double clusterClassificationThreshold,
+ boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
- ClusterClassificationDriver.run(points, output, new Path(output, CLUSTERED_POINTS_DIRECTORY),
- clusterClassificationThreshold, true, runSequential);
+ ClusterClassificationDriver.run(points,
+ output,
+ new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+ clusterClassificationThreshold, true, runSequential);
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java Wed Jun 20 12:07:50 2012
@@ -20,12 +20,14 @@ package org.apache.mahout.clustering.cla
/**
* Constants used in Cluster Classification.
*/
-public class ClusterClassificationConfigKeys {
+public final class ClusterClassificationConfigKeys {
public static final String CLUSTERS_IN = "clusters_in";
public static final String OUTLIER_REMOVAL_THRESHOLD = "pdf_threshold";
public static final String EMIT_MOST_LIKELY = "emit_most_likely";
-
+
+ private ClusterClassificationConfigKeys() {
+ }
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java Wed Jun 20 12:07:50 2012
@@ -17,10 +17,6 @@
package org.apache.mahout.clustering.classify;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.CLUSTERS_IN;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.EMIT_MOST_LIKELY;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -56,7 +52,7 @@ import org.apache.mahout.math.VectorWrit
* Classifies the vectors into different clusters found by the clustering
* algorithm.
*/
-public class ClusterClassificationDriver extends AbstractJob {
+public final class ClusterClassificationDriver extends AbstractJob {
/**
* CLI to run Cluster Classification Driver.
@@ -98,7 +94,8 @@ public class ClusterClassificationDriver
/**
* Constructor to be used by the ToolRunner.
*/
- private ClusterClassificationDriver() {}
+ private ClusterClassificationDriver() {
+ }
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new ClusterClassificationDriver(), args);
@@ -158,13 +155,12 @@ public class ClusterClassificationDriver
*/
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
List<Cluster> clusterModels = new ArrayList<Cluster>();
- Cluster cluster = null;
Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
Iterator<?> it = new SequenceFileDirValueIterator<Writable>(finalClustersPath, PathType.LIST,
PathFilters.partFilter(), null, false, conf);
while (it.hasNext()) {
ClusterWritable next = (ClusterWritable) it.next();
- cluster = (Cluster) next.getValue();
+ Cluster cluster = next.getValue();
cluster.configure(conf);
clusterModels.add(cluster);
}
@@ -174,8 +170,7 @@ public class ClusterClassificationDriver
private static Path finalClustersPath(Configuration conf, Path clusterOutputPath) throws IOException {
FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
- Path finalClustersPath = clusterFiles[0].getPath();
- return finalClustersPath;
+ return clusterFiles[0].getPath();
}
/**
@@ -246,17 +241,17 @@ public class ClusterClassificationDriver
* @return whether the vector should be classified or not.
*/
private static boolean shouldClassify(Vector pdfPerCluster, Double clusterClassificationThreshold) {
- boolean isMaxPDFGreatherThanThreshold = pdfPerCluster.maxValue() >= clusterClassificationThreshold;
- return isMaxPDFGreatherThanThreshold;
+ return pdfPerCluster.maxValue() >= clusterClassificationThreshold;
}
private static void classifyClusterMR(Configuration conf, Path input, Path clustersIn, Path output,
Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException, InterruptedException,
ClassNotFoundException {
- conf.setFloat(OUTLIER_REMOVAL_THRESHOLD, clusterClassificationThreshold.floatValue());
- conf.setBoolean(EMIT_MOST_LIKELY, emitMostLikely);
- conf.set(CLUSTERS_IN, clustersIn.toUri().toString());
+ conf.setFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD,
+ clusterClassificationThreshold.floatValue());
+ conf.setBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, emitMostLikely);
+ conf.set(ClusterClassificationConfigKeys.CLUSTERS_IN, clustersIn.toUri().toString());
Job job = new Job(conf, "Cluster Classification Driver running over input: " + input);
job.setJarByClass(ClusterClassificationDriver.class);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java Wed Jun 20 12:07:50 2012
@@ -17,10 +17,6 @@
package org.apache.mahout.clustering.classify;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.CLUSTERS_IN;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.EMIT_MOST_LIKELY;
-import static org.apache.mahout.clustering.classify.ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -50,7 +46,7 @@ import org.apache.mahout.math.VectorWrit
public class ClusterClassificationMapper extends
Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
- private static double threshold;
+ private double threshold;
private List<Cluster> clusterModels;
private ClusterClassifier clusterClassifier;
private IntWritable clusterId;
@@ -58,14 +54,13 @@ public class ClusterClassificationMapper
private boolean emitMostLikely;
@Override
- protected void setup(Context context) throws IOException,
- InterruptedException {
+ protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
- String clustersIn = conf.get(CLUSTERS_IN);
- threshold = conf.getFloat(OUTLIER_REMOVAL_THRESHOLD, 0.0f);
- emitMostLikely = conf.getBoolean(EMIT_MOST_LIKELY, false);
+ String clustersIn = conf.get(ClusterClassificationConfigKeys.CLUSTERS_IN);
+ threshold = conf.getFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.0f);
+ emitMostLikely = conf.getBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, false);
clusterModels = new ArrayList<Cluster>();
@@ -119,37 +114,29 @@ public class ClusterClassificationMapper
context.write(clusterId, weightedVW);
}
- public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf)
- throws IOException {
+ public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
List<Cluster> clusters = new ArrayList<Cluster>();
- Cluster cluster = null;
FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
- FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath,
- PathFilters.finalPartFilter());
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
Iterator<?> it = new SequenceFileDirValueIterator<Writable>(
clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(),
null, false, conf);
while (it.hasNext()) {
ClusterWritable next = (ClusterWritable) it.next();
- cluster = next.getValue();
+ Cluster cluster = next.getValue();
cluster.configure(conf);
clusters.add(cluster);
}
return clusters;
}
- private static boolean shouldClassify(Vector pdfPerCluster) {
- boolean isMaxPDFGreatherThanThreshold = pdfPerCluster.maxValue() >= threshold;
- return isMaxPDFGreatherThanThreshold;
+ private boolean shouldClassify(Vector pdfPerCluster) {
+ return pdfPerCluster.maxValue() >= threshold;
}
- private static Path finalClustersPath(Path clusterOutputPath)
- throws IOException {
- FileSystem fileSystem = clusterOutputPath
- .getFileSystem(new Configuration());
- FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath,
- PathFilters.finalPartFilter());
- Path finalClustersPath = clusterFiles[0].getPath();
- return finalClustersPath;
+ private static Path finalClustersPath(Path clusterOutputPath) throws IOException {
+ FileSystem fileSystem = clusterOutputPath.getFileSystem(new Configuration());
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+ return clusterFiles[0].getPath();
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java Wed Jun 20 12:07:50 2012
@@ -80,7 +80,7 @@ public class WeightedPropertyVectorWrita
@Override
public String toString() {
Vector vector = getVector();
- StringBuilder bldr = new StringBuilder("wt: ").append(getWeight()).append(" ");
+ StringBuilder bldr = new StringBuilder("wt: ").append(getWeight()).append(' ');
if (properties != null && !properties.isEmpty()) {
for (Map.Entry<Text, Text> entry : properties.entrySet()) {
bldr.append(entry.getKey().toString()).append(": ").append(entry.getValue().toString()).append(' ');
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Wed Jun 20 12:07:50 2012
@@ -17,8 +17,6 @@
package org.apache.mahout.clustering.dirichlet;
-import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY;
-
import java.io.IOException;
import java.util.List;
@@ -36,6 +34,7 @@ import org.apache.mahout.clustering.diri
import org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.clustering.iterator.ClusterIterator;
import org.apache.mahout.clustering.iterator.DirichletClusteringPolicy;
+import org.apache.mahout.clustering.topdown.PathDirectory;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -200,9 +199,9 @@ public class DirichletDriver extends Abs
prior.writeToSeqFiles(clustersIn);
if (runSequential) {
- new ClusterIterator().iterateSeq(conf, input, clustersIn, output, maxIterations);
+ ClusterIterator.iterateSeq(conf, input, clustersIn, output, maxIterations);
} else {
- new ClusterIterator().iterateMR(conf, input, clustersIn, output, maxIterations);
+ ClusterIterator.iterateMR(conf, input, clustersIn, output, maxIterations);
}
return output;
@@ -218,10 +217,6 @@ public class DirichletDriver extends Abs
* the directory pathname for input state
* @param output
* the directory pathname for output points
- * @param alpha0
- * TODO
- * @param numModels
- * TODO
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* @param threshold
@@ -233,7 +228,7 @@ public class DirichletDriver extends Abs
int numModels, boolean emitMostLikely, double threshold, boolean runSequential) throws IOException,
InterruptedException, ClassNotFoundException {
ClusterClassifier.writePolicy(new DirichletClusteringPolicy(numModels, alpha0), stateIn);
- ClusterClassificationDriver.run(conf, input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), threshold,
+ ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY), threshold,
emitMostLikely, runSequential);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/DistributionDescription.java Wed Jun 20 12:07:50 2012
@@ -17,8 +17,6 @@
package org.apache.mahout.clustering.dirichlet.models;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
@@ -40,7 +38,10 @@ public final class DistributionDescripti
private final String distanceMeasure;
private final int prototypeSize;
- public DistributionDescription(String modelFactory, String modelPrototype, String distanceMeasure, int prototypeSize) {
+ public DistributionDescription(String modelFactory,
+ String modelPrototype,
+ String distanceMeasure,
+ int prototypeSize) {
this.modelFactory = modelFactory;
this.modelPrototype = modelPrototype;
this.distanceMeasure = distanceMeasure;
@@ -65,36 +66,24 @@ public final class DistributionDescripti
/**
* Create an instance of AbstractVectorModelDistribution from the given command line arguments
- *
- * @param conf
- * the Configuration
*/
public ModelDistribution<VectorWritable> createModelDistribution(Configuration conf) {
- ClassLoader ccl = Thread.currentThread().getContextClassLoader();
- AbstractVectorModelDistribution modelDistribution;
- try {
- modelDistribution = ClassUtils.instantiateAs(modelFactory, AbstractVectorModelDistribution.class);
-
- Class<? extends Vector> vcl = ccl.loadClass(modelPrototype).asSubclass(Vector.class);
- Constructor<? extends Vector> v = vcl.getConstructor(int.class);
- modelDistribution.setModelPrototype(new VectorWritable(v.newInstance(prototypeSize)));
+ AbstractVectorModelDistribution modelDistribution =
+ ClassUtils.instantiateAs(modelFactory, AbstractVectorModelDistribution.class);
+
+ Vector prototype = ClassUtils.instantiateAs(modelPrototype,
+ Vector.class,
+ new Class<?>[] {int.class},
+ new Object[] {prototypeSize});
- if (modelDistribution instanceof DistanceMeasureClusterDistribution) {
- DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasure, DistanceMeasure.class);
- measure.configure(conf);
- ((DistanceMeasureClusterDistribution) modelDistribution).setMeasure(measure);
- }
- } catch (ClassNotFoundException cnfe) {
- throw new IllegalStateException(cnfe);
- } catch (NoSuchMethodException nsme) {
- throw new IllegalStateException(nsme);
- } catch (InstantiationException ie) {
- throw new IllegalStateException(ie);
- } catch (IllegalAccessException iae) {
- throw new IllegalStateException(iae);
- } catch (InvocationTargetException ite) {
- throw new IllegalStateException(ite);
+ modelDistribution.setModelPrototype(new VectorWritable(prototype));
+
+ if (modelDistribution instanceof DistanceMeasureClusterDistribution) {
+ DistanceMeasure measure = ClassUtils.instantiateAs(distanceMeasure, DistanceMeasure.class);
+ measure.configure(conf);
+ ((DistanceMeasureClusterDistribution) modelDistribution).setMeasure(measure);
}
+
return modelDistribution;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/models/GaussianCluster.java Wed Jun 20 12:07:50 2012
@@ -47,10 +47,7 @@ public class GaussianCluster extends Abs
public Model<VectorWritable> sampleFromPosterior() {
return new GaussianCluster(getCenter(), getRadius(), getId());
}
-
- /* (non-Javadoc)
- * @see org.apache.mahout.clustering.AbstractCluster#setRadius(org.apache.mahout.math.Vector)
- */
+
@Override
protected void setRadius(Vector s2) {
super.setRadius(s2);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Wed Jun 20 12:07:50 2012
@@ -17,8 +17,6 @@
package org.apache.mahout.clustering.fuzzykmeans;
-import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -33,6 +31,7 @@ import org.apache.mahout.clustering.iter
import org.apache.mahout.clustering.iterator.ClusteringPolicy;
import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.clustering.topdown.PathDirectory;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
@@ -283,9 +282,9 @@ public class FuzzyKMeansDriver extends A
prior.writeToSeqFiles(priorClustersPath);
if (runSequential) {
- new ClusterIterator().iterateSeq(conf, input, priorClustersPath, output, maxIterations);
+ ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations);
} else {
- new ClusterIterator().iterateMR(conf, input, priorClustersPath, output, maxIterations);
+ ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations);
}
return output;
}
@@ -321,7 +320,7 @@ public class FuzzyKMeansDriver extends A
throws IOException, ClassNotFoundException, InterruptedException {
ClusterClassifier.writePolicy(new FuzzyKMeansClusteringPolicy(m, convergenceDelta), clustersIn);
- ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY), threshold, true,
+ ClusterClassificationDriver.run(input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY), threshold, true,
runSequential);
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java Wed Jun 20 12:07:50 2012
@@ -18,16 +18,8 @@ import org.apache.mahout.math.VectorWrit
public class CIMapper extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,ClusterWritable> {
private ClusterClassifier classifier;
-
private ClusteringPolicy policy;
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper
- * .Context)
- */
+
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
@@ -38,13 +30,7 @@ public class CIMapper extends Mapper<Wri
policy.update(classifier);
super.setup(context);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
- * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
- */
+
@Override
protected void map(WritableComparable<?> key, VectorWritable value, Context context) throws IOException,
InterruptedException {
@@ -55,14 +41,7 @@ public class CIMapper extends Mapper<Wri
classifier.train(el.index(), value.get(), el.get());
}
}
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.
- * Mapper.Context)
- */
+
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
List<Cluster> clusters = classifier.getModels();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java Wed Jun 20 12:07:50 2012
@@ -53,14 +53,7 @@ public class CIReducer extends Reducer<I
classifier.close();
context.write(key, first);
}
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper
- * .Context)
- */
+
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java Wed Jun 20 12:07:50 2012
@@ -29,20 +29,10 @@ import org.apache.mahout.math.Vector;
*
*/
public class CanopyClusteringPolicy extends AbstractClusteringPolicy {
-
- public CanopyClusteringPolicy() {
- super();
- }
-
- private double t1, t2;
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.mahout.clustering.ClusteringPolicy#select(org.apache.mahout.
- * math.Vector)
- */
+
+ private double t1;
+ private double t2;
+
@Override
public Vector select(Vector probabilities) {
int maxValueIndex = probabilities.maxValueIndex();
@@ -50,23 +40,13 @@ public class CanopyClusteringPolicy exte
weights.set(maxValueIndex, 1.0);
return weights;
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
- */
+
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(t1);
out.writeDouble(t2);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
- */
+
@Override
public void readFields(DataInput in) throws IOException {
this.t1 = in.readDouble();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java Wed Jun 20 12:07:50 2012
@@ -46,15 +46,16 @@ import com.google.common.io.Closeables;
* algorithm (currently k-means, fuzzy-k-means and Dirichlet) that processes all the input vectors in each iteration.
* The cluster classifier is configured with a ClusteringPolicy to select the desired clustering algorithm.
*/
-public class ClusterIterator {
+public final class ClusterIterator {
public static final String PRIOR_PATH_KEY = "org.apache.mahout.clustering.prior.path";
+
+ private ClusterIterator() {
+ }
/**
* Iterate over data using a prior-trained ClusterClassifier, for a number of iterations
- *
- * @param policy
- * the ClusteringPolicy to use
+ *
* @param data
* a {@code List<Vector>} of input vectors
* @param classifier
@@ -64,7 +65,7 @@ public class ClusterIterator {
*
* @return the posterior ClusterClassifier
*/
- public ClusterClassifier iterate(Iterable<Vector> data, ClusterClassifier classifier, int numIterations) {
+ public static ClusterClassifier iterate(Iterable<Vector> data, ClusterClassifier classifier, int numIterations) {
ClusteringPolicy policy = classifier.getPolicy();
for (int iteration = 1; iteration <= numIterations; iteration++) {
for (Vector vector : data) {
@@ -100,11 +101,9 @@ public class ClusterIterator {
* a Path of output directory
* @param numIterations
* the int number of iterations to perform
- *
- * @throws IOException
*/
- public void iterateSeq(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
- throws IOException {
+ public static void iterateSeq(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
+ throws IOException {
ClusterClassifier classifier = new ClusterClassifier();
classifier.readFromSeqFiles(conf, priorPath);
Path clustersOut = null;
@@ -155,8 +154,8 @@ public class ClusterIterator {
* @param numIterations
* the int number of iterations to perform
*/
- public void iterateMR(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
- throws IOException, InterruptedException, ClassNotFoundException {
+ public static void iterateMR(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
+ throws IOException, InterruptedException, ClassNotFoundException {
ClusteringPolicy policy = ClusterClassifier.readPolicy(priorPath);
Path clustersOut = null;
int iteration = 1;
@@ -164,7 +163,6 @@ public class ClusterIterator {
conf.set(PRIOR_PATH_KEY, priorPath.toString());
String jobName = "Cluster Iterator running iteration " + iteration + " over priorPath: " + priorPath;
- System.out.println(jobName);
Job job = new Job(conf, jobName);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(ClusterWritable.class);
@@ -205,7 +203,7 @@ public class ClusterIterator {
* @throws IOException
* if there was an IO error
*/
- private boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
+ private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) {
SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<ClusterWritable>(
part.getPath(), true, conf);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java Wed Jun 20 12:07:50 2012
@@ -36,7 +36,7 @@ public interface ClusteringPolicy extend
* @return a Vector of probabilities that the data is described by each of the
* models
*/
- public Vector classify(Vector data, ClusterClassifier prior);
+ Vector classify(Vector data, ClusterClassifier prior);
/**
* Return a vector of weights for each of the models given those probabilities
@@ -45,7 +45,7 @@ public interface ClusteringPolicy extend
* a Vector of pdfs
* @return a Vector of weights
*/
- public Vector select(Vector probabilities);
+ Vector select(Vector probabilities);
/**
* Update the policy with the given classifier
@@ -53,7 +53,7 @@ public interface ClusteringPolicy extend
* @param posterior
* a ClusterClassifier
*/
- public void update(ClusterClassifier posterior);
+ void update(ClusterClassifier posterior);
/**
* Close the policy using the classifier's models
@@ -61,6 +61,6 @@ public interface ClusteringPolicy extend
* @param posterior
* a posterior ClusterClassifier
*/
- public void close(ClusterClassifier posterior);
+ void close(ClusterClassifier posterior);
}
\ No newline at end of file
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/DirichletClusteringPolicy.java Wed Jun 20 12:07:50 2012
@@ -30,7 +30,6 @@ import org.apache.mahout.math.VectorWrit
public class DirichletClusteringPolicy extends AbstractClusteringPolicy {
public DirichletClusteringPolicy() {
- super();
}
/**
@@ -51,14 +50,7 @@ public class DirichletClusteringPolicy e
// Alpha_0 primes the Dirichlet distribution
private double alpha0;
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.mahout.clustering.ClusteringPolicy#select(org.apache.mahout.
- * math.Vector)
- */
+
@Override
public Vector select(Vector probabilities) {
int rMultinom = UncommonDistributions.rMultinom(probabilities.times(mixture));
@@ -68,13 +60,7 @@ public class DirichletClusteringPolicy e
}
// update the total counts and then the mixture
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.mahout.clustering.ClusteringPolicy#update(org.apache.mahout.
- * clustering.ClusterClassifier)
- */
+
@Override
public void update(ClusterClassifier prior) {
Vector totalCounts = new DenseVector(prior.getModels().size());
@@ -83,23 +69,13 @@ public class DirichletClusteringPolicy e
}
mixture = UncommonDistributions.rDirichlet(totalCounts, alpha0);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
- */
+
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(alpha0);
VectorWritable.writeVector(out, mixture);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
- */
+
@Override
public void readFields(DataInput in) throws IOException {
this.alpha0 = in.readDouble();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java Wed Jun 20 12:07:50 2012
@@ -36,26 +36,18 @@ import com.google.common.collect.Lists;
*
*/
public class FuzzyKMeansClusteringPolicy extends AbstractClusteringPolicy {
-
- public FuzzyKMeansClusteringPolicy() {
- super();
- }
-
+
private double m = 2;
-
private double convergenceDelta = 0.05;
-
+
+ public FuzzyKMeansClusteringPolicy() {
+ }
+
public FuzzyKMeansClusteringPolicy(double m, double convergenceDelta) {
this.m = m;
+ this.convergenceDelta = convergenceDelta;
}
-
- /*
- * (non-Javadoc)
- *
- * @see
- * org.apache.mahout.clustering.ClusteringPolicy#select(org.apache.mahout.
- * math.Vector)
- */
+
@Override
public Vector select(Vector probabilities) {
return probabilities;
@@ -74,23 +66,13 @@ public class FuzzyKMeansClusteringPolicy
fuzzyKMeansClusterer.setM(m);
return fuzzyKMeansClusterer.computePi(clusters, distances);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
- */
+
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(m);
out.writeDouble(convergenceDelta);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
- */
+
@Override
public void readFields(DataInput in) throws IOException {
this.m = in.readDouble();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java Wed Jun 20 12:07:50 2012
@@ -31,31 +31,19 @@ import org.apache.mahout.clustering.clas
public class KMeansClusteringPolicy extends AbstractClusteringPolicy {
public KMeansClusteringPolicy() {
- super();
}
public KMeansClusteringPolicy(double convergenceDelta) {
- super();
this.convergenceDelta = convergenceDelta;
}
private double convergenceDelta = 0.001;
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
- */
+
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(convergenceDelta);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
- */
+
@Override
public void readFields(DataInput in) throws IOException {
this.convergenceDelta = in.readDouble();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/iterator/MeanShiftClusteringPolicy.java Wed Jun 20 12:07:50 2012
@@ -27,17 +27,11 @@ import java.io.IOException;
*/
public class MeanShiftClusteringPolicy extends AbstractClusteringPolicy {
- public MeanShiftClusteringPolicy() {
- super();
- }
-
- private double t1, t2, t3, t4;
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
- */
+ private double t1;
+ private double t2;
+ private double t3;
+ private double t4;
+
@Override
public void write(DataOutput out) throws IOException {
out.writeDouble(t1);
@@ -45,12 +39,7 @@ public class MeanShiftClusteringPolicy e
out.writeDouble(t3);
out.writeDouble(t4);
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
- */
+
@Override
public void readFields(DataInput in) throws IOException {
this.t1 = in.readDouble();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Wed Jun 20 12:07:50 2012
@@ -16,8 +16,6 @@
*/
package org.apache.mahout.clustering.kmeans;
-import static org.apache.mahout.clustering.topdown.PathDirectory.CLUSTERED_POINTS_DIRECTORY;
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -31,6 +29,7 @@ import org.apache.mahout.clustering.clas
import org.apache.mahout.clustering.iterator.ClusterIterator;
import org.apache.mahout.clustering.iterator.ClusteringPolicy;
import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
+import org.apache.mahout.clustering.topdown.PathDirectory;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
@@ -224,9 +223,9 @@ public class KMeansDriver extends Abstra
prior.writeToSeqFiles(priorClustersPath);
if (runSequential) {
- new ClusterIterator().iterateSeq(conf, input, priorClustersPath, output, maxIterations);
+ ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations);
} else {
- new ClusterIterator().iterateMR(conf, input, priorClustersPath, output, maxIterations);
+ ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations);
}
return output;
}
@@ -257,7 +256,7 @@ public class KMeansDriver extends Abstra
log.info("Input: {} Clusters In: {} Out: {} Distance: {}", new Object[] {input, clustersIn, output, measure});
}
ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
- ClusterClassificationDriver.run(input, output, new Path(output, CLUSTERED_POINTS_DIRECTORY),
+ ClusterClassificationDriver.run(input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
clusterClassificationThreshold, true, runSequential);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java?rev=1352052&r1=1352051&r2=1352052&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java Wed Jun 20 12:07:50 2012
@@ -30,7 +30,8 @@ public class Kluster extends DistanceMea
private boolean converged;
/** For (de)serialization as a Writable */
- public Kluster() {}
+ public Kluster() {
+ }
/**
* Construct a new cluster with the given point as its center