You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/27 13:14:32 UTC
[06/24] mahout git commit: MAHOUT-2034 Split MR and New Examples into seperate modules

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java b/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
deleted file mode 100644
index 632b32c..0000000
--- a/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Reads and trains an adaptive logistic regression model on the 20 newsgroups data.
- * The first command line argument gives the path of the directory holding the training
- * data.  The optional second argument, leakType, defines which classes of features to use.
- * Importantly, leakType controls whether a synthetic date is injected into the data as
- * a target leak and if so, how.
- * <p/>
- * The value of leakType % 3 determines whether the target leak is injected according to
- * the following table:
- * <p/>
- * <table>
- * <tr><td valign='top'>0</td><td>No leak injected</td></tr>
- * <tr><td valign='top'>1</td><td>Synthetic date injected in MMM-yyyy format. This will be a single token and
- * is a perfect target leak since each newsgroup is given a different month</td></tr>
- * <tr><td valign='top'>2</td><td>Synthetic date injected in dd-MMM-yyyy HH:mm:ss format.  The day varies
- * and thus there are more leak symbols that need to be learned.  Ultimately this is just
- * as big a leak as case 1.</td></tr>
- * </table>
- * <p/>
- * The leakType also determines what other text will be indexed.  If leakType is greater
- * than or equal to 6, then neither headers nor text body will be used for features and the leak is the only
- * source of data.  If leakType is at least 3 but less than 6, then only subject words will be used as
- * features.  If leakType is less than 3, then both subject and body text will be used as features.
- * <p/>
- * A leakType of 0 gives no leak and all textual features.
- * <p/>
- * See the following table for a summary of commonly used values for leakType
- * <p/>
- * <table>
- * <tr><td><b>leakType</b></td><td><b>Leak?</b></td><td><b>Subject?</b></td><td><b>Body?</b></td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>0</td><td>no</td><td>yes</td><td>yes</td></tr>
- * <tr><td>1</td><td>mmm-yyyy</td><td>yes</td><td>yes</td></tr>
- * <tr><td>2</td><td>dd-mmm-yyyy</td><td>yes</td><td>yes</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>3</td><td>no</td><td>yes</td><td>no</td></tr>
- * <tr><td>4</td><td>mmm-yyyy</td><td>yes</td><td>no</td></tr>
- * <tr><td>5</td><td>dd-mmm-yyyy</td><td>yes</td><td>no</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>6</td><td>no</td><td>no</td><td>no</td></tr>
- * <tr><td>7</td><td>mmm-yyyy</td><td>no</td><td>no</td></tr>
- * <tr><td>8</td><td>dd-mmm-yyyy</td><td>no</td><td>no</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * </table>
- */
-public final class TrainNewsGroups {
-
-  private TrainNewsGroups() {
-  }
-
-  /**
-   * Trains an adaptive logistic regression model on a directory that holds one
-   * sub-directory per newsgroup, writes the first model of the best learner to
-   * {@code news-group.model} in the JVM temp directory and prints the largest word counts.
-   *
-   * @param args {@code args[0]} is the training directory; the optional {@code args[1]}
-   *             is the integer leakType (0 when absent)
-   * @throws IllegalArgumentException if no training directory is given
-   * @throws NumberFormatException if {@code args[1]} is not an integer
-   * @throws IOException if a directory cannot be listed, or propagated from the helpers called
-   */
-  public static void main(String[] args) throws IOException {
-    if (args.length < 1) {
-      throw new IllegalArgumentException("usage: TrainNewsGroups <trainingDir> [leakType]");
-    }
-    File base = new File(args[0]);
-
-    Multiset<String> overallCounts = HashMultiset.create();
-
-    int leakType = 0;
-    if (args.length > 1) {
-      leakType = Integer.parseInt(args[1]);
-    }
-
-    Dictionary newsGroups = new Dictionary();
-
-    NewsgroupHelper helper = new NewsgroupHelper();
-    helper.getEncoder().setProbes(2);
-    AdaptiveLogisticRegression learningAlgorithm =
-        new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
-    learningAlgorithm.setInterval(800);
-    learningAlgorithm.setAveragingWindow(500);
-
-    // Each sub-directory name is interned as a category; every file inside it is a document.
-    List<File> files = new ArrayList<>();
-    for (File newsgroup : listOrFail(base)) {
-      if (newsgroup.isDirectory()) {
-        newsGroups.intern(newsgroup.getName());
-        files.addAll(Arrays.asList(listOrFail(newsgroup)));
-      }
-    }
-    Collections.shuffle(files);
-    System.out.println(files.size() + " training files");
-    SGDInfo info = new SGDInfo();
-
-    int k = 0;
-
-    for (File file : files) {
-      // The parent directory name is the label of the document.
-      String ng = file.getParentFile().getName();
-      int actual = newsGroups.intern(ng);
-
-      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
-      learningAlgorithm.train(actual, v);
-
-      k++;
-      State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
-
-      SGDHelper.analyzeState(info, leakType, k, best);
-    }
-    learningAlgorithm.close();
-    SGDHelper.dissect(leakType, newsGroups, learningAlgorithm, files, overallCounts);
-    System.out.println("exiting main");
-
-    File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group.model");
-    ModelSerializer.writeBinary(modelFile.getAbsolutePath(),
-        learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
-
-    // Print the word counts in descending order, stopping once k exceeds 1000.
-    List<Integer> counts = new ArrayList<>();
-    System.out.println("Word counts");
-    for (String count : overallCounts.elementSet()) {
-      counts.add(overallCounts.count(count));
-    }
-    Collections.sort(counts, Ordering.natural().reverse());
-    k = 0;
-    for (Integer count : counts) {
-      System.out.println(k + "\t" + count);
-      k++;
-      if (k > 1000) {
-        break;
-      }
-    }
-  }
-
-  /**
-   * Lists the entries of {@code dir}. {@link File#listFiles()} returns {@code null} rather
-   * than throwing when the path is not a directory or cannot be read, so that case is
-   * turned into an explicit {@link IOException} instead of a later NullPointerException.
-   */
-  private static File[] listOrFail(File dir) throws IOException {
-    File[] entries = dir.listFiles();
-    if (entries == null) {
-      throw new IOException("Cannot list directory: " + dir);
-    }
-    return entries;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java b/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
deleted file mode 100644
index 7a74289..0000000
--- a/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.Locale;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.ConfusionMatrix;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-
-/*
- * AUC and the log-likelihood summary are shown whenever possible. If the target has more than
- * two values, then AUC and the entropy matrix are not shown regardless of the flags the user
- * passes, because the current implementation only supports them for targets with at most two values.
- * */
-public final class ValidateAdaptiveLogistic {
-
-  // Command-line settings; assigned by parseArgs() and read by mainToOutput().
-  private static String inputFile;
-  private static String modelFile;
-  private static String defaultCategory;
-  private static boolean showAuc;
-  private static boolean showScores;
-  private static boolean showConfusion;
-
-  private ValidateAdaptiveLogistic() {
-  }
-
-  /**
-   * Command-line entry point; writes the report to standard output encoded as UTF-8.
-   *
-   * @param args command-line options (--input, --model, --defaultCategory, --auc, --scores,
-   *             --confusion, --quiet, --help)
-   * @throws IOException propagated from loading the model or reading the input
-   */
-  public static void main(String[] args) throws IOException {
-    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
-  }
-
-  /**
-   * Runs the stored model over every data line of the input file and prints a report:
-   * a log-likelihood summary, the AUC when it was collected (at most two target categories),
-   * one line per record when --scores is set, and the confusion matrix (plus the entropy
-   * matrix when AUC was collected) when --confusion is set. When none of --auc, --confusion
-   * and --scores is given, the AUC and confusion flags are switched on.
-   *
-   * @param args   command-line options
-   * @param output destination of all report text
-   * @throws IOException propagated from loading the model or reading the input
-   */
-  static void mainToOutput(String[] args, PrintWriter output) throws IOException {
-    if (!parseArgs(args)) {
-      return;
-    }
-    if (!showAuc && !showConfusion && !showScores) {
-      showAuc = true;
-      showConfusion = true;
-    }
-
-    AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
-        .loadFromFile(new File(modelFile));
-    CsvRecordFactory csv = lmp.getCsvRecordFactory();
-    AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
-
-    // AUC, and the entropy matrix derived from it, is only collected for at most two categories.
-    Auc collector = null;
-    if (lmp.getTargetCategories().size() <= 2) {
-      collector = new Auc();
-    }
-
-    OnlineSummarizer slh = new OnlineSummarizer();
-    ConfusionMatrix cm = new ConfusionMatrix(lmp.getTargetCategories(), defaultCategory);
-
-    State<Wrapper, CrossFoldLearner> best = lr.getBest();
-    if (best == null) {
-      output.println("AdaptiveLogisticRegression has not been trained properly.");
-      return;
-    }
-    CrossFoldLearner learner = best.getPayload().getLearner();
-
-    // try-with-resources closes the input even when a record fails to parse.
-    try (BufferedReader in = TrainLogistic.open(inputFile)) {
-      // The first line goes to the record factory via firstLine(); the rest are scored.
-      String line = in.readLine();
-      csv.firstLine(line);
-      line = in.readLine();
-      if (showScores) {
-        output.println("\"target\", \"model-output\", \"log-likelihood\", \"average-likelihood\"");
-      }
-      while (line != null) {
-        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
-        //TODO: How to avoid extra target values not shown in the training process.
-        int target = csv.processLine(line, v);
-        double likelihood = learner.logLikelihood(target, v);
-        double score = learner.classifyFull(v).maxValue();
-
-        slh.add(likelihood);
-        cm.addInstance(csv.getTargetString(line), csv.getTargetLabel(target));
-
-        if (showScores) {
-          // Reuses the likelihood computed above instead of evaluating it a second time.
-          output.printf(Locale.ENGLISH, "%8d, %.12f, %.13f, %.13f%n", target,
-              score, likelihood, slh.getMean());
-        }
-        if (collector != null) {
-          collector.add(target, score);
-        }
-        line = in.readLine();
-      }
-    }
-
-    output.printf(Locale.ENGLISH,"\nLog-likelihood:");
-    output.printf(Locale.ENGLISH, "Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f%n",
-        slh.getMin(), slh.getMax(), slh.getMean(), slh.getMedian());
-
-    if (collector != null) {
-      output.printf(Locale.ENGLISH, "%nAUC = %.2f%n", collector.auc());
-    }
-
-    if (showConfusion) {
-      output.printf(Locale.ENGLISH, "%n%s%n%n", cm.toString());
-
-      if (collector != null) {
-        Matrix m = collector.entropy();
-        output.printf(Locale.ENGLISH,
-            "Entropy Matrix: [[%.1f, %.1f], [%.1f, %.1f]]%n", m.get(0, 0),
-            m.get(1, 0), m.get(0, 1), m.get(1, 1));
-      }
-    }
-  }
-
-  /**
-   * Declares the command-line options, parses {@code args} and stores the values in the
-   * static fields of this class.
-   *
-   * @param args raw command-line arguments
-   * @return {@code false} when the parser yields no command line (presumably after printing
-   *         help or a parse error -- confirm against commons-cli2), {@code true} otherwise
-   */
-  private static boolean parseArgs(String[] args) {
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help")
-        .withDescription("print this list").create();
-
-    // Accepted by the parser, but its value is not read anywhere in this class.
-    Option quiet = builder.withLongName("quiet")
-        .withDescription("be extra quiet").create();
-
-    Option auc = builder.withLongName("auc").withDescription("print AUC")
-        .create();
-    Option confusion = builder.withLongName("confusion")
-        .withDescription("print confusion matrix").create();
-
-    Option scores = builder.withLongName("scores")
-        .withDescription("print scores").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder
-        .withLongName("input")
-        .withRequired(true)
-        .withArgument(
-            argumentBuilder.withName("input").withMaximum(1)
-                .create())
-        .withDescription("where to get validate data").create();
-
-    Option modelFileOption = builder
-        .withLongName("model")
-        .withRequired(true)
-        .withArgument(
-            argumentBuilder.withName("model").withMaximum(1)
-                .create())
-        .withDescription("where to get the trained model").create();
-
-    Option defaultCategoryOption = builder
-      .withLongName("defaultCategory")
-      .withRequired(false)
-      .withArgument(
-          argumentBuilder.withName("defaultCategory").withMaximum(1).withDefault("unknown")
-          .create())
-      .withDescription("the default category value to use").create();
-
-    Group normalArgs = new GroupBuilder().withOption(help)
-        .withOption(quiet).withOption(auc).withOption(scores)
-        .withOption(confusion).withOption(inputFileOption)
-        .withOption(modelFileOption).withOption(defaultCategoryOption).create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
-    CommandLine cmdLine = parser.parseAndHelp(args);
-
-    if (cmdLine == null) {
-      return false;
-    }
-
-    inputFile = getStringArgument(cmdLine, inputFileOption);
-    modelFile = getStringArgument(cmdLine, modelFileOption);
-    defaultCategory = getStringArgument(cmdLine, defaultCategoryOption);
-    showAuc = getBooleanArgument(cmdLine, auc);
-    showScores = getBooleanArgument(cmdLine, scores);
-    showConfusion = getBooleanArgument(cmdLine, confusion);
-
-    return true;
-  }
-
-  /** Returns whether {@code option} is present on the parsed command line. */
-  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
-    return cmdLine.hasOption(option);
-  }
-
-  /** Returns the value of {@code option} on the parsed command line, cast to a String. */
-  private static String getStringArgument(CommandLine cmdLine, Option option) {
-    return (String) cmdLine.getValue(option);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java b/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
deleted file mode 100644
index ab3c861..0000000
--- a/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import com.google.common.collect.Lists;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.classifier.sgd.L1;
-import org.apache.mahout.classifier.sgd.OnlineLogisticRegression;
-
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Uses the SGD classifier on the 'Bank marketing' dataset from UCI.
- *
- * See http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
- *
- * Learn when people accept or reject an offer from the bank via telephone based on income, age, education and more.
- */
-public class BankMarketingClassificationMain {
-
-  public static final int NUM_CATEGORIES = 2;
-
-  /**
-   * Runs 20 experiments over the calls parsed from the {@code bank-full.csv} resource. Each
-   * run reshuffles the calls, holds out the first 10% as test data, trains a fresh model for
-   * 20 passes over the remainder and, on every fifth pass starting with pass 0, prints the
-   * pass number, the current learning rate and the AUC measured on the held-out calls.
-   */
-  public static void main(String[] args) throws Exception {
-    List<TelephoneCall> calls = Lists.newArrayList(new TelephoneCallParser("bank-full.csv"));
-
-    // Despite the name this is a fraction (0.10 == 10%), not a percentage.
-    double heldOutPercentage = 0.10;
-
-    for (int run = 0; run < 20; run++) {
-      Collections.shuffle(calls);
-      int cutoff = (int) (heldOutPercentage * calls.size());
-      List<TelephoneCall> heldOut = calls.subList(0, cutoff);
-      List<TelephoneCall> training = calls.subList(cutoff, calls.size());
-
-      OnlineLogisticRegression model = newModel();
-      for (int pass = 0; pass < 20; pass++) {
-        for (TelephoneCall call : training) {
-          model.train(call.getTarget(), call.asVector());
-        }
-        if (pass % 5 != 0) {
-          continue;
-        }
-        Auc eval = scoreHeldOut(model, heldOut);
-        System.out.printf("%d, %.4f, %.4f\n", pass, model.currentLearningRate(), eval.auc());
-      }
-    }
-  }
-
-  /** Creates an untrained two-category model with L1 prior and the fixed example settings. */
-  private static OnlineLogisticRegression newModel() {
-    return new OnlineLogisticRegression(NUM_CATEGORIES, TelephoneCall.FEATURES, new L1())
-        .learningRate(1)
-        .alpha(1)
-        .lambda(0.000001)
-        .stepOffset(10000)
-        .decayExponent(0.2);
-  }
-
-  /** Feeds the scalar score of every held-out call, with its true target, into a new Auc. */
-  private static Auc scoreHeldOut(OnlineLogisticRegression model, List<TelephoneCall> heldOut) {
-    Auc eval = new Auc(0.5);
-    for (TelephoneCall call : heldOut) {
-      eval.add(call.getTarget(), model.classifyScalar(call.asVector()));
-    }
-    return eval;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java b/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
deleted file mode 100644
index 728ec20..0000000
--- a/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
-
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-/**
- * One row of the bank marketing data: keeps the raw field values by name and the feature
- * vector encoded from them.
- */
-public class TelephoneCall {
-  /** Cardinality of the feature vector built for each call. */
-  public static final int FEATURES = 100;
-  private static final ConstantValueEncoder interceptEncoder = new ConstantValueEncoder("intercept");
-  private static final FeatureVectorEncoder featureEncoder = new StaticWordValueEncoder("feature");
-
-  private final RandomAccessSparseVector vector;
-
-  private final Map<String, String> fields = new LinkedHashMap<>();
-
-  /**
-   * Builds a call from parallel sequences of field names and values. Every value is stored
-   * under its field name; numeric fields are log-transformed and the categorical fields are
-   * encoded as {@code name:value} indicators. Values beyond the last field name are ignored.
-   *
-   * @param fieldNames column names, in column order
-   * @param values     column values of one row, in the same order
-   * @throws IllegalArgumentException if a field name is not recognised, or if {@code values}
-   *                                  has fewer entries than {@code fieldNames}
-   * @throws NumberFormatException if a numeric field cannot be parsed as a double
-   */
-  public TelephoneCall(Iterable<String> fieldNames, Iterable<String> values) {
-    vector = new RandomAccessSparseVector(FEATURES);
-    Iterator<String> value = values.iterator();
-    interceptEncoder.addToVector("1", vector);
-    for (String name : fieldNames) {
-      // Report a short row with the offending field instead of a bare NoSuchElementException.
-      if (!value.hasNext()) {
-        throw new IllegalArgumentException(String.format("Missing value for field: %s", name));
-      }
-      String fieldValue = value.next();
-      fields.put(name, fieldValue);
-
-      switch (name) {
-        case "age": {
-          double v = Double.parseDouble(fieldValue);
-          featureEncoder.addToVector(name, Math.log(v), vector);
-          break;
-        }
-        case "balance": {
-          double v = Double.parseDouble(fieldValue);
-          // Clamp at -2000 so that v + 2001 stays positive for the logarithm.
-          if (v < -2000) {
-            v = -2000;
-          }
-          featureEncoder.addToVector(name, Math.log(v + 2001) - 8, vector);
-          break;
-        }
-        case "duration": {
-          double v = Double.parseDouble(fieldValue);
-          featureEncoder.addToVector(name, Math.log(v + 1) - 5, vector);
-          break;
-        }
-        case "pdays": {
-          double v = Double.parseDouble(fieldValue);
-          // NOTE(review): the +2 offset presumably accommodates a -1 sentinel -- confirm
-          // against the dataset description.
-          featureEncoder.addToVector(name, Math.log(v + 2), vector);
-          break;
-        }
-        case "job":
-        case "marital":
-        case "education":
-        case "default":
-        case "housing":
-        case "loan":
-        case "contact":
-        case "campaign":
-        case "previous":
-        case "poutcome":
-          // Treated as categorical: one indicator feature per distinct name:value pair.
-          featureEncoder.addToVector(name + ":" + fieldValue, 1, vector);
-          break;
-        case "day":
-        case "month":
-        case "y":
-          // ignore these for vectorizing
-          break;
-        default:
-          throw new IllegalArgumentException(String.format("Bad field name: %s", name));
-      }
-    }
-  }
-
-  /** Returns the feature vector encoded in the constructor. */
-  public Vector asVector() {
-    return vector;
-  }
-
-  /** Returns 0 when the stored {@code y} field equals "no" and 1 for any other value. */
-  public int getTarget() {
-    return fields.get("y").equals("no") ? 0 : 1;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java b/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
deleted file mode 100644
index 5ef6490..0000000
--- a/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import com.google.common.base.CharMatcher;
-import com.google.common.base.Splitter;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Resources;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.Iterator;
-
-/**
- * Parses semicolon-separated data from a classpath resource as {@link TelephoneCall}s.
- * The first line of the resource supplies the field names; every later line becomes one call.
- */
-public class TelephoneCallParser implements Iterable<TelephoneCall> {
-
-  // Splits on ';' and strips double quotes, spaces and semicolons from both ends of each field.
-  private final Splitter onSemi = Splitter.on(";").trimResults(CharMatcher.anyOf("\" ;"));
-  private final String resourceName;
-
-  /**
-   * Remembers the resource to read; nothing is opened until {@link #iterator()} is called.
-   *
-   * @param resourceName name of the resource, resolved with {@code Resources.getResource}
-   * @throws IOException never thrown by this body; the clause is kept so existing callers
-   *                     that handle it still compile
-   */
-  public TelephoneCallParser(String resourceName) throws IOException {
-    this.resourceName = resourceName;
-  }
-
-  /**
-   * Opens the resource, reads the header line and returns an iterator over the remaining
-   * lines. The underlying reader is closed once the end of the data has been reached.
-   *
-   * @throws RuntimeException wrapping any {@link IOException} raised while opening or reading
-   */
-  @Override
-  public Iterator<TelephoneCall> iterator() {
-    final BufferedReader input;
-    final Iterable<String> fieldNames;
-    try {
-      // Decode explicitly as UTF-8 so parsing does not depend on the platform default charset.
-      input = new BufferedReader(new InputStreamReader(
-          Resources.getResource(resourceName).openStream(), StandardCharsets.UTF_8));
-      fieldNames = onSemi.split(input.readLine());
-    } catch (IOException e) {
-      throw new RuntimeException("Error reading data", e);
-    }
-
-    return new AbstractIterator<TelephoneCall>() {
-      @Override
-      protected TelephoneCall computeNext() {
-        try {
-          String line = input.readLine();
-          if (line == null) {
-            // Exhausted: release the stream before signalling the end of the iteration.
-            input.close();
-            return endOfData();
-          }
-
-          return new TelephoneCall(fieldNames, onSemi.split(line));
-        } catch (IOException e) {
-          throw new RuntimeException("Error reading data", e);
-        }
-      }
-    };
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java b/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
deleted file mode 100644
index a0b845f..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-/** Path filter that accepts only paths whose string form contains {@code "/clusters-"}. */
-final class ClustersFilter implements PathFilter {
-
-  @Override
-  public boolean accept(Path path) {
-    return path.toString().contains("/clusters-");
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
deleted file mode 100644
index 50dba99..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.BasicStroke;
-import java.awt.Color;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-
-/**
- * Desktop graphics example for canopy clustering: {@link #main(String[])} generates random
- * sample data, clusters it with {@code CanopyDriver} and opens a window showing the result.
- */
-@Deprecated
-public class DisplayCanopy extends DisplayClustering {
-
-  DisplayCanopy() {
-    initialize();
-    this.setTitle("Canopy Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-
-  /** Draws the sample data first and the clusters on top of it. */
-  @Override
-  public void paint(Graphics g) {
-    Graphics2D g2 = (Graphics2D) g;
-    plotSampleData(g2);
-    plotClusters(g2);
-  }
-
-  /**
-   * Draws every significant cluster in {@code CLUSTERS}: two thin blue ellipses sized by T1
-   * and T2, then an ellipse of three times the cluster radius in a colour chosen by the
-   * position of the cluster list. The colour index starts at {@code CLUSTERS.size() - 1} and
-   * drops by one per list; the list that reaches index 0 is drawn with a stroke of width 3.
-   */
-  protected static void plotClusters(Graphics2D g2) {
-    int colorIndex = CLUSTERS.size() - 1;
-    for (List<Cluster> generation : CLUSTERS) {
-      for (Cluster cluster : generation) {
-        if (!isSignificant(cluster)) {
-          continue;
-        }
-        // The T1 and T2 distance thresholds, drawn around the centre.
-        g2.setStroke(new BasicStroke(1));
-        g2.setColor(Color.BLUE);
-        plotEllipse(g2, cluster.getCenter(), new DenseVector(new double[] {T1, T1}));
-        plotEllipse(g2, cluster.getCenter(), new DenseVector(new double[] {T2, T2}));
-
-        // The cluster itself, at three times its radius.
-        int paletteSlot = Math.min(DisplayClustering.COLORS.length - 1, colorIndex);
-        g2.setColor(COLORS[paletteSlot]);
-        float width = colorIndex == 0 ? 3 : 1;
-        g2.setStroke(new BasicStroke(width));
-        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
-      }
-      colorIndex--;
-    }
-  }
-
-  /**
-   * Deletes the {@code samples} and {@code output} paths, switches the random number
-   * generator to the test seed, writes freshly generated samples, builds canopy clusters with
-   * the Manhattan distance and thresholds T1/T2, loads them and opens the display.
-   */
-  public static void main(String[] args) throws Exception {
-    Configuration conf = new Configuration();
-    Path samples = new Path("samples");
-    Path output = new Path("output");
-    HadoopUtil.delete(conf, samples);
-    HadoopUtil.delete(conf, output);
-
-    RandomUtils.useTestSeed();
-    generateSamples();
-    writeSampleData(samples);
-
-    CanopyDriver.buildClusters(conf, samples, output, new ManhattanDistanceMeasure(), T1, T2, 0, true);
-    loadClustersWritable(output);
-
-    new DisplayCanopy();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
deleted file mode 100644
index ad85c6a..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.*;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.Ellipse2D;
-import java.awt.geom.Rectangle2D;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.clustering.AbstractCluster;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class DisplayClustering extends Frame {
-  
-  private static final Logger log = LoggerFactory.getLogger(DisplayClustering.class);
-  
-  protected static final int DS = 72; // default scale = 72 pixels per inch
-  
-  protected static final int SIZE = 8; // screen size in inches
-  
-  private static final Collection<Vector> SAMPLE_PARAMS = new ArrayList<>();
-  
-  protected static final List<VectorWritable> SAMPLE_DATA = new ArrayList<>();
-  
-  protected static final List<List<Cluster>> CLUSTERS = new ArrayList<>();
-  
-  static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
-    Color.lightGray };
-  
-  protected static final double T1 = 3.0;
-  
-  protected static final double T2 = 2.8;
-  
-  static double significance = 0.05;
-  
-  protected static int res; // screen resolution
-  
-  public DisplayClustering() {
-    initialize();
-    this.setTitle("Sample Data");
-  }
-  
-  public void initialize() {
-    // Get screen resolution
-    res = Toolkit.getDefaultToolkit().getScreenResolution();
-    
-    // Set Frame size in inches
-    this.setSize(SIZE * res, SIZE * res);
-    this.setVisible(true);
-    this.setTitle("Asymmetric Sample Data");
-    
-    // Window listener to terminate program.
-    this.addWindowListener(new WindowAdapter() {
-      @Override
-      public void windowClosing(WindowEvent e) {
-        System.exit(0);
-      }
-    });
-  }
-  
-  public static void main(String[] args) throws Exception {
-    RandomUtils.useTestSeed();
-    generateSamples();
-    new DisplayClustering();
-  }
-  
-  // Override the paint() method
-  @Override
-  public void paint(Graphics g) {
-    Graphics2D g2 = (Graphics2D) g;
-    plotSampleData(g2);
-    plotSampleParameters(g2);
-    plotClusters(g2);
-  }
-  
-  protected static void plotClusters(Graphics2D g2) {
-    int cx = CLUSTERS.size() - 1;
-    for (List<Cluster> clusters : CLUSTERS) {
-      g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
-      g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
-      for (Cluster cluster : clusters) {
-        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
-      }
-    }
-  }
-  
-  protected static void plotSampleParameters(Graphics2D g2) {
-    Vector v = new DenseVector(2);
-    Vector dv = new DenseVector(2);
-    g2.setColor(Color.RED);
-    for (Vector param : SAMPLE_PARAMS) {
-      v.set(0, param.get(0));
-      v.set(1, param.get(1));
-      dv.set(0, param.get(2) * 3);
-      dv.set(1, param.get(3) * 3);
-      plotEllipse(g2, v, dv);
-    }
-  }
-  
-  protected static void plotSampleData(Graphics2D g2) {
-    double sx = (double) res / DS;
-    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-    
-    // plot the axes
-    g2.setColor(Color.BLACK);
-    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
-    plotRectangle(g2, new DenseVector(2).assign(2), dv);
-    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-    
-    // plot the sample data
-    g2.setColor(Color.DARK_GRAY);
-    dv.assign(0.03);
-    for (VectorWritable v : SAMPLE_DATA) {
-      plotRectangle(g2, v.get(), dv);
-    }
-  }
-  
-  /**
-   * This method plots points and colors them according to their cluster
-   * membership, rather than drawing ellipses.
-   * 
-   * As of commit, this method is used only by K-means spectral clustering.
-   * Since the cluster assignments are set within the eigenspace of the data, it
-   * is not inherent that the original data cluster as they would in K-means:
-   * that is, as symmetric gaussian mixtures.
-   * 
-   * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
-   * output is not directly usable. Rather, the cluster assignments from the raw
-   * output need to be transferred back to the original data. As such, this
-   * method will read the SequenceFile cluster results of K-means and transfer
-   * the cluster assignments to the original data, coloring them appropriately.
-   * 
-   * @param g2
-   * @param data
-   */
-  protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
-    double sx = (double) res / DS;
-    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-    
-    g2.setColor(Color.BLACK);
-    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
-    plotRectangle(g2, new DenseVector(2).assign(2), dv);
-    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-    
-    // plot the sample data, colored according to the cluster they belong to
-    dv.assign(0.03);
-    
-    Path clusteredPointsPath = new Path(data, "clusteredPoints");
-    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
-    Map<Integer,Color> colors = new HashMap<>();
-    int point = 0;
-    for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
-        inputPath, new Configuration())) {
-      int clusterId = record.getFirst().get();
-      VectorWritable v = SAMPLE_DATA.get(point++);
-      Integer key = clusterId;
-      if (!colors.containsKey(key)) {
-        colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
-      }
-      plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
-    }
-  }
-  
-  /**
-   * Identical to plotRectangle(), but with the option of setting the color of
-   * the rectangle's stroke.
-   * 
-   * NOTE: This should probably be refactored with plotRectangle() since most of
-   * the code here is direct copy/paste from that method.
-   * 
-   * @param g2
-   *          A Graphics2D context.
-   * @param v
-   *          A vector for the rectangle's center.
-   * @param dv
-   *          A vector for the rectangle's dimensions.
-   * @param color
-   *          The color of the rectangle's stroke.
-   */
-  protected static void plotClusteredRectangle(Graphics2D g2, Vector v, Vector dv, Color color) {
-    double[] flip = {1, -1};
-    Vector v2 = v.times(new DenseVector(flip));
-    v2 = v2.minus(dv.divide(2));
-    int h = SIZE / 2;
-    double x = v2.get(0) + h;
-    double y = v2.get(1) + h;
-    
-    g2.setStroke(new BasicStroke(1));
-    g2.setColor(color);
-    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
-  }
-  
-  /**
-   * Draw a rectangle on the graphics context
-   * 
-   * @param g2
-   *          a Graphics2D context
-   * @param v
-   *          a Vector of rectangle center
-   * @param dv
-   *          a Vector of rectangle dimensions
-   */
-  protected static void plotRectangle(Graphics2D g2, Vector v, Vector dv) {
-    double[] flip = {1, -1};
-    Vector v2 = v.times(new DenseVector(flip));
-    v2 = v2.minus(dv.divide(2));
-    int h = SIZE / 2;
-    double x = v2.get(0) + h;
-    double y = v2.get(1) + h;
-    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
-  }
-  
-  /**
-   * Draw an ellipse on the graphics context
-   * 
-   * @param g2
-   *          a Graphics2D context
-   * @param v
-   *          a Vector of ellipse center
-   * @param dv
-   *          a Vector of ellipse dimensions
-   */
-  protected static void plotEllipse(Graphics2D g2, Vector v, Vector dv) {
-    double[] flip = {1, -1};
-    Vector v2 = v.times(new DenseVector(flip));
-    v2 = v2.minus(dv.divide(2));
-    int h = SIZE / 2;
-    double x = v2.get(0) + h;
-    double y = v2.get(1) + h;
-    g2.draw(new Ellipse2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
-  }
-  
-  protected static void generateSamples() {
-    generateSamples(500, 1, 1, 3);
-    generateSamples(300, 1, 0, 0.5);
-    generateSamples(300, 0, 2, 0.1);
-  }
-  
-  protected static void generate2dSamples() {
-    generate2dSamples(500, 1, 1, 3, 1);
-    generate2dSamples(300, 1, 0, 0.5, 1);
-    generate2dSamples(300, 0, 2, 0.1, 0.5);
-  }
-  
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sd
-   *          double standard deviation of the samples
-   */
-  protected static void generateSamples(int num, double mx, double my, double sd) {
-    double[] params = {mx, my, sd, sd};
-    SAMPLE_PARAMS.add(new DenseVector(params));
-    log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
-    for (int i = 0; i < num; i++) {
-      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
-          UncommonDistributions.rNorm(my, sd)})));
-    }
-  }
-  
-  protected static void writeSampleData(Path output) throws IOException {
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(output.toUri(), conf);
-
-    try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class)) {
-      int i = 0;
-      for (VectorWritable vw : SAMPLE_DATA) {
-        writer.append(new Text("sample_" + i++), vw);
-      }
-    }
-  }
-  
-  protected static List<Cluster> readClustersWritable(Path clustersIn) {
-    List<Cluster> clusters = new ArrayList<>();
-    Configuration conf = new Configuration();
-    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
-        PathFilters.logsCRCFilter(), conf)) {
-      Cluster cluster = value.getValue();
-      log.info(
-          "Reading Cluster:{} center:{} numPoints:{} radius:{}",
-          cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
-          cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null));
-      clusters.add(cluster);
-    }
-    return clusters;
-  }
-  
-  protected static void loadClustersWritable(Path output) throws IOException {
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(output.toUri(), conf);
-    for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
-      List<Cluster> clusters = readClustersWritable(s.getPath());
-      CLUSTERS.add(clusters);
-    }
-  }
-  
-  /**
-   * Generate random samples and add them to the sampleData
-   * 
-   * @param num
-   *          int number of samples to generate
-   * @param mx
-   *          double x-value of the sample mean
-   * @param my
-   *          double y-value of the sample mean
-   * @param sdx
-   *          double x-value standard deviation of the samples
-   * @param sdy
-   *          double y-value standard deviation of the samples
-   */
-  protected static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
-    double[] params = {mx, my, sdx, sdy};
-    SAMPLE_PARAMS.add(new DenseVector(params));
-    log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", num, mx, my, sdx, sdy);
-    for (int i = 0; i < num; i++) {
-      SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sdx),
-          UncommonDistributions.rNorm(my, sdy)})));
-    }
-  }
-  
-  protected static boolean isSignificant(Cluster cluster) {
-    return (double) cluster.getNumObservations() / SAMPLE_DATA.size() > significance;
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
deleted file mode 100644
index f8ce7c7..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.ClusterClassifier;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
-import org.apache.mahout.clustering.iterator.ClusterIterator;
-import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.collect.Lists;
-
-public class DisplayFuzzyKMeans extends DisplayClustering {
-  
-  DisplayFuzzyKMeans() {
-    initialize();
-    this.setTitle("Fuzzy k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-  
-  // Override the paint() method
-  @Override
-  public void paint(Graphics g) {
-    plotSampleData((Graphics2D) g);
-    plotClusters((Graphics2D) g);
-  }
-  
-  public static void main(String[] args) throws Exception {
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    
-    Path samples = new Path("samples");
-    Path output = new Path("output");
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, output);
-    HadoopUtil.delete(conf, samples);
-    RandomUtils.useTestSeed();
-    DisplayClustering.generateSamples();
-    writeSampleData(samples);
-    boolean runClusterer = true;
-    int maxIterations = 10;
-    float threshold = 0.001F;
-    float m = 1.1F;
-    if (runClusterer) {
-      runSequentialFuzzyKClusterer(conf, samples, output, measure, maxIterations, m, threshold);
-    } else {
-      int numClusters = 3;
-      runSequentialFuzzyKClassifier(conf, samples, output, measure, numClusters, maxIterations, m, threshold);
-    }
-    new DisplayFuzzyKMeans();
-  }
-  
-  private static void runSequentialFuzzyKClassifier(Configuration conf, Path samples, Path output,
-      DistanceMeasure measure, int numClusters, int maxIterations, float m, double threshold) throws IOException {
-    Collection<Vector> points = Lists.newArrayList();
-    for (int i = 0; i < numClusters; i++) {
-      points.add(SAMPLE_DATA.get(i).get());
-    }
-    List<Cluster> initialClusters = Lists.newArrayList();
-    int id = 0;
-    for (Vector point : points) {
-      initialClusters.add(new SoftCluster(point, id++, measure));
-    }
-    ClusterClassifier prior = new ClusterClassifier(initialClusters, new FuzzyKMeansClusteringPolicy(m, threshold));
-    Path priorPath = new Path(output, "classifier-0");
-    prior.writeToSeqFiles(priorPath);
-    
-    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
-    loadClustersWritable(output);
-  }
-  
-  private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
-      DistanceMeasure measure, int maxIterations, float m, double threshold) throws IOException,
-      ClassNotFoundException, InterruptedException {
-    Path clustersIn = new Path(output, "random-seeds");
-    RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
-    FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
-        true);
-    
-    loadClustersWritable(output);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
deleted file mode 100644
index 336d69e..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.ClusterClassifier;
-import org.apache.mahout.clustering.iterator.ClusterIterator;
-import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.collect.Lists;
-
-public class DisplayKMeans extends DisplayClustering {
-  
-  DisplayKMeans() {
-    initialize();
-    this.setTitle("k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-  
-  public static void main(String[] args) throws Exception {
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    Path samples = new Path("samples");
-    Path output = new Path("output");
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, samples);
-    HadoopUtil.delete(conf, output);
-    
-    RandomUtils.useTestSeed();
-    generateSamples();
-    writeSampleData(samples);
-    boolean runClusterer = true;
-    double convergenceDelta = 0.001;
-    int numClusters = 3;
-    int maxIterations = 10;
-    if (runClusterer) {
-      runSequentialKMeansClusterer(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
-    } else {
-      runSequentialKMeansClassifier(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
-    }
-    new DisplayKMeans();
-  }
-  
-  private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
-      DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException {
-    Collection<Vector> points = Lists.newArrayList();
-    for (int i = 0; i < numClusters; i++) {
-      points.add(SAMPLE_DATA.get(i).get());
-    }
-    List<Cluster> initialClusters = Lists.newArrayList();
-    int id = 0;
-    for (Vector point : points) {
-      initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(point, id++, measure));
-    }
-    ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta));
-    Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
-    prior.writeToSeqFiles(priorPath);
-    
-    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
-    loadClustersWritable(output);
-  }
-  
-  private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
-    DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Path clustersIn = new Path(output, "random-seeds");
-    RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
-    KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
-    loadClustersWritable(output);
-  }
-  
-  // Override the paint() method
-  @Override
-  public void paint(Graphics g) {
-    plotSampleData((Graphics2D) g);
-    plotClusters((Graphics2D) g);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
deleted file mode 100644
index 2b70749..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.BufferedWriter;
-import java.io.FileWriter;
-import java.io.Writer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-
-public class DisplaySpectralKMeans extends DisplayClustering {
-
-  protected static final String SAMPLES = "samples";
-  protected static final String OUTPUT = "output";
-  protected static final String TEMP = "tmp";
-  protected static final String AFFINITIES = "affinities";
-
-  DisplaySpectralKMeans() {
-    initialize();
-    setTitle("Spectral k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
-  }
-
-  public static void main(String[] args) throws Exception {
-    DistanceMeasure measure = new ManhattanDistanceMeasure();
-    Path samples = new Path(SAMPLES);
-    Path output = new Path(OUTPUT);
-    Path tempDir = new Path(TEMP);
-    Configuration conf = new Configuration();
-    HadoopUtil.delete(conf, samples);
-    HadoopUtil.delete(conf, output);
-
-    RandomUtils.useTestSeed();
-    DisplayClustering.generateSamples();
-    writeSampleData(samples);
-    Path affinities = new Path(output, AFFINITIES);
-    FileSystem fs = FileSystem.get(output.toUri(), conf);
-    if (!fs.exists(output)) {
-      fs.mkdirs(output);
-    }
-
-    try (Writer writer = new BufferedWriter(new FileWriter(affinities.toString()))){
-      for (int i = 0; i < SAMPLE_DATA.size(); i++) {
-        for (int j = 0; j < SAMPLE_DATA.size(); j++) {
-          writer.write(i + "," + j + ',' + measure.distance(SAMPLE_DATA.get(i).get(),
-              SAMPLE_DATA.get(j).get()) + '\n');
-        }
-      }
-    }
-
-    int maxIter = 10;
-    double convergenceDelta = 0.001;
-    SpectralKMeansDriver.run(new Configuration(), affinities, output, SAMPLE_DATA.size(), 3, measure,
-        convergenceDelta, maxIter, tempDir);
-    new DisplaySpectralKMeans();
-  }
-
-  @Override
-  public void paint(Graphics g) {
-    plotClusteredSampleData((Graphics2D) g, new Path(new Path(OUTPUT), "kmeans_out"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/README.txt b/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
deleted file mode 100644
index 470c16c..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-The following classes can be run without parameters to generate a sample data set and 
-run the reference clustering implementations over them:
-
-DisplayClustering - generates 1000 samples from three, symmetric distributions. This is the same 
-    data set that is used by the following clustering programs. It displays the points on a screen
-    and superimposes the model parameters that were used to generate the points. You can edit the
-    generateSamples() method to change the sample points used by these programs.
-    
-  * DisplayCanopy - uses Canopy clustering
-  * DisplayKMeans - uses k-Means clustering
-  * DisplayFuzzyKMeans - uses Fuzzy k-Means clustering
-  
-  * NOTE: some of these programs display the sample points and then superimpose all of the clusters
-    from each iteration. The last iteration's clusters are in bold red and the previous several are 
-    colored (orange, yellow, green, blue, violet) in order after which all earlier clusters are in
-    light grey. This helps to visualize how the clusters converge upon a solution over multiple
-    iterations.
-  * NOTE: by changing the parameter values (k, ALPHA_0, numIterations) and the display SIGNIFICANCE
-    you can obtain different results.
-    
-  
-    
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java b/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
deleted file mode 100644
index c29cbc4..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.streaming.tools;
-
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.List;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.ClusteringUtils;
-import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.math.Centroid;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-
-public class ClusterQualitySummarizer extends AbstractJob { // command-line tool: per-cluster distance summaries plus Dunn and Davies-Bouldin indices
-  private String outputFile; // --output: path of the CSV report
-
-  private PrintWriter fileOut; // CSV writer; opened in run() on outputFile
-
-  private String trainFile; // --input: sequence files with the training vectors
-  private String testFile; // --testInput: optional sequence files with the test vectors
-  private String centroidFile; // --centroids: sequence files with the centroids
-  private String centroidCompareFile; // --centroidsCompare: optional second set of centroids
-  private boolean mahoutKMeansFormat; // true: centroidFile is read as ClusterWritable values
-  private boolean mahoutKMeansFormatCompare; // true: centroidCompareFile is read as ClusterWritable values
-
-  private DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure(); // used for the combined summaries and both quality indices
-
-  public void printSummaries(List<OnlineSummarizer> summarizers, String type) { // delegates to the static overload with this job's fileOut
-    printSummaries(summarizers, type, fileOut);
-  }
-
-  public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) { // prints to stdout; CSV rows only when fileOut is non-null
-    double maxDistance = 0; // largest getMax() over the clusters that have more than one point
-    for (int i = 0; i < summarizers.size(); ++i) {
-      OnlineSummarizer summarizer = summarizers.get(i);
-      if (summarizer.getCount() > 1) {
-        maxDistance = Math.max(maxDistance, summarizer.getMax());
-        System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
-        // Only clusters with more than one point reach this branch; the others are reported by the else
-        // branch below and get no CSV row. Column order matches the header line written in run().
-        if (fileOut != null) {
-          fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
-              summarizer.getSD(),
-              summarizer.getQuartile(0),
-              summarizer.getQuartile(1),
-              summarizer.getQuartile(2),
-              summarizer.getQuartile(3),
-              summarizer.getQuartile(4), summarizer.getCount(), type);
-        }
-      } else {
-        System.out.printf("Cluster %d is has %d data point. Need atleast 2 data points in a cluster for" +
-            " OnlineSummarizer.\n", i, summarizer.getCount());
-      }
-    }
-    System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
-  }
-
-  public int run(String[] args) throws IOException { // returns -1 when parseArgs fails, otherwise 0
-    if (!parseArgs(args)) {
-      return -1;
-    }
-
-    Configuration conf = new Configuration();
-    try {
-      fileOut = new PrintWriter(new FileOutputStream(outputFile));
-      fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
-          + "distance.q4,count,is.train\n");
-
-      // Read the primary centroids; the optional comparison set is read further below.
-      List<Centroid> centroids;
-      List<Centroid> centroidsCompare = null; // stays null when no --centroidsCompare was given
-      if (mahoutKMeansFormat) {
-        SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
-            new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
-        centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
-      } else {
-        SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
-            new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
-        centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
-      }
-
-      if (centroidCompareFile != null) {
-        if (mahoutKMeansFormatCompare) {
-          SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
-              new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
-          centroidsCompare = Lists.newArrayList(
-              IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
-        } else {
-          SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
-              new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
-          centroidsCompare = Lists.newArrayList(
-              IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
-        }
-      }
-
-      // Read the "training" set and write its per-cluster rows tagged "train".
-      SequenceFileDirValueIterable<VectorWritable> trainIterable =
-          new SequenceFileDirValueIterable<>(new Path(trainFile), PathType.GLOB, conf);
-      Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
-      Iterable<Vector> datapoints = trainDatapoints; // replaced by train + test below when a test set is given
-
-      printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
-          new SquaredEuclideanDistanceMeasure()), "train");
-
-      // Optionally summarize the "test" set as well, tagged "test".
-      if (testFile != null) {
-        SequenceFileDirValueIterable<VectorWritable> testIterable =
-            new SequenceFileDirValueIterable<>(new Path(testFile), PathType.GLOB, conf);
-        Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
-
-        printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
-            new SquaredEuclideanDistanceMeasure()), "test");
-
-        datapoints = Iterables.concat(trainDatapoints, testDatapoints);
-      }
-
-      // All train/test CSV rows have been written at this point; the quality indices below go to stdout only.
-      List<OnlineSummarizer> summaries =
-          ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
-      List<OnlineSummarizer> compareSummaries = null;
-      if (centroidsCompare != null) {
-        compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
-      }
-      System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
-      if (compareSummaries != null) {
-        System.out.printf(" Second: %f\n", ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
-      } else {
-        System.out.printf("\n");
-      }
-      System.out.printf("[Davies-Bouldin Index] First: %f",
-          ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
-      if (compareSummaries != null) {
-        System.out.printf(" Second: %f\n",
-          ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
-      } else {
-        System.out.printf("\n");
-      }
-    } catch (IOException e) {
-      System.out.println(e.getMessage()); // NOTE(review): the failure is only printed and run() still returns 0 — confirm this is intended
-    } finally {
-      Closeables.close(fileOut, false);
-    }
-    return 0;
-  }
-
-  private boolean parseArgs(String[] args) { // fills the option fields; returns false when parseAndHelp yields no command line
-    DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
-    Option help = builder.withLongName("help").withDescription("print this list").create();
-
-    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
-    Option inputFileOption = builder.withLongName("input")
-        .withShortName("i")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
-        .withDescription("where to get seq files with the vectors (training set)")
-        .create();
-
-    Option testInputFileOption = builder.withLongName("testInput")
-        .withShortName("itest")
-        .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
-        .withDescription("where to get seq files with the vectors (test set)")
-        .create();
-
-    Option centroidsFileOption = builder.withLongName("centroids")
-        .withShortName("c")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
-        .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
-        .create();
-
-    Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
-        .withShortName("cc")
-        .withRequired(false)
-        .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
-        .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or "
-            + "StreamingKMeansDriver)")
-        .create();
-
-    Option outputFileOption = builder.withLongName("output")
-        .withShortName("o")
-        .withRequired(true)
-        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
-        .withDescription("where to dump the CSV file with the results")
-        .create();
-
-    Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
-        .withShortName("mkm")
-        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
-        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create()) // NOTE(review): argument is named "numpoints" but only the option's presence is checked below — confirm
-        .create();
-
-    Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
-        .withShortName("mkmc")
-        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
-        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create()) // NOTE(review): same "numpoints" argument name as above — confirm
-        .create();
-
-    Group normalArgs = new GroupBuilder()
-        .withOption(help)
-        .withOption(inputFileOption)
-        .withOption(testInputFileOption)
-        .withOption(outputFileOption)
-        .withOption(centroidsFileOption)
-        .withOption(centroidsCompareFileOption)
-        .withOption(mahoutKMeansFormatOption)
-        .withOption(mahoutKMeansCompareFormatOption)
-        .create();
-
-    Parser parser = new Parser();
-    parser.setHelpOption(help);
-    parser.setHelpTrigger("--help");
-    parser.setGroup(normalArgs);
-    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
-
-    CommandLine cmdLine = parser.parseAndHelp(args);
-    if (cmdLine == null) {
-      return false;
-    }
-
-    trainFile = (String) cmdLine.getValue(inputFileOption);
-    if (cmdLine.hasOption(testInputFileOption)) {
-      testFile = (String) cmdLine.getValue(testInputFileOption);
-    }
-    centroidFile = (String) cmdLine.getValue(centroidsFileOption);
-    if (cmdLine.hasOption(centroidsCompareFileOption)) {
-      centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
-    }
-    outputFile = (String) cmdLine.getValue(outputFileOption);
-    if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
-      mahoutKMeansFormat = true;
-    }
-    if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
-      mahoutKMeansFormatCompare = true;
-    }
-    return true;
-  }
-
-  public static void main(String[] args) throws IOException { // the status returned by run() is discarded
-    new ClusterQualitySummarizer().run(args);
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java b/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
deleted file mode 100644
index bd1149b..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.streaming.tools;
-
-import com.google.common.base.Function;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
-import org.apache.mahout.math.Centroid;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
-public class IOUtils {
-
-  private IOUtils() {} // static helpers only; not instantiable
-
-  /**
-   * Converts CentroidWritable values in a sequence file into Centroids lazily; each Centroid is cloned.
-   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
-   * @return an Iterable<Centroid> with the converted vectors.
-   */
-  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
-      Iterable<CentroidWritable>  dirIterable) {
-    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
-      @Override
-      public Centroid apply(CentroidWritable input) {
-        Preconditions.checkNotNull(input);
-        return input.getCentroid().clone();
-      }
-    });
-  }
-
-  /**
-   * Converts ClusterWritable values in a sequence file into Centroids lazily, numbering them from 0.
-   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
-   * @return an Iterable<Centroid> built from each cluster's center and total observations.
-   */
-  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable>  dirIterable) {
-    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
-      int numClusters = 0; // index for the next Centroid; NOTE(review): never reset, so a second pass over the result would presumably continue numbering — confirm
-      @Override
-      public Centroid apply(ClusterWritable input) {
-        Preconditions.checkNotNull(input);
-        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
-            input.getValue().getTotalObservations());
-      }
-    });
-  }
-
-  /**
-   * Converts VectorWritable values in a sequence file into Vectors lazily; each Vector is cloned.
-   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
-   * @return an Iterable<Vector> with the converted vectors.
-   */
-  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
-    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
-      @Override
-      public Vector apply(VectorWritable input) {
-        Preconditions.checkNotNull(input);
-        return input.get().clone();
-      }
-    });
-  }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/02f75f99/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
deleted file mode 100644
index 083cd8c..0000000
--- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.canopy;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-@Deprecated
-public final class Job extends AbstractJob {
-
-  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data"; // subdirectory of the output path that receives the InputDriver output
-
-  private Job() {
-  }
-
-  private static final Logger log = LoggerFactory.getLogger(Job.class);
-
-  public static void main(String[] args) throws Exception {
-    if (args.length > 0) {
-      log.info("Running with only user-supplied arguments");
-      ToolRunner.run(new Configuration(), new Job(), args);
-    } else {
-      log.info("Running with default arguments");
-      Path output = new Path("output");
-      HadoopUtil.delete(new Configuration(), output); // default run always starts from a clean "output" directory
-      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
-    }
-  }
-
-  /**
-   * Run the canopy clustering job on an input dataset using the given distance
-   * measure, t1 and t2 parameters. The input is first passed through
-   * InputDriver into the "data" subdirectory of the output directory. This
-   * method does not delete an existing output directory itself; main() and the
-   * overwrite option do that. The cluster dumper then reads
-   * <output>/clusters-0-final and <output>/clusteredPoints. When main() is
-   * started without arguments, a file containing synthetic_control.data from
-   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
-   * is expected in a directory named "testdata", and output goes to "output".
-   *
-   * @param input
-   *          the Path of the input directory
-   * @param output
-   *          the Path of the output directory
-   * @param measure
-   *          the DistanceMeasure to use
-   * @param t1
-   *          the canopy T1 threshold
-   * @param t2
-   *          the canopy T2 threshold
-   */
-  private static void run(Path input, Path output, DistanceMeasure measure,
-      double t1, double t2) throws Exception {
-    Path directoryContainingConvertedInput = new Path(output,
-        DIRECTORY_CONTAINING_CONVERTED_INPUT);
-    InputDriver.runJob(input, directoryContainingConvertedInput,
-        "org.apache.mahout.math.RandomAccessSparseVector");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
-        output, measure, t1, t2, true, 0.0, false); // NOTE(review): meaning of the trailing true, 0.0, false is not visible here — confirm against CanopyDriver.run
-    // dump the final clusters together with the clustered points
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
-        "clusters-0-final"), new Path(output, "clusteredPoints"));
-    clusterDumper.printClusters(null);
-  }
-
-  @Override
-  public int run(String[] args) throws Exception { // returns -1 when argument parsing fails, otherwise 0
-
-    addInputOption();
-    addOutputOption();
-    addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.t1Option().create());
-    addOption(DefaultOptionCreator.t2Option().create());
-    addOption(DefaultOptionCreator.overwriteOption().create());
-
-    Map<String, List<String>> argMap = parseArguments(args);
-    if (argMap == null) {
-      return -1;
-    }
-
-    Path input = getInputPath();
-    Path output = getOutputPath();
-    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
-      HadoopUtil.delete(new Configuration(), output); // output is removed only when the overwrite option is present
-    }
-    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
-    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
-    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
-
-    run(input, output, measure, t1, t2);
-    return 0;
-  }
-
-}