Posted to commits@mahout.apache.org by df...@apache.org on 2013/06/12 16:26:38 UTC

svn commit: r1492219 - in /mahout/trunk: ./ examples/src/main/java/org/apache/mahout/clustering/streaming/ examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ src/conf/

Author: dfilimon
Date: Wed Jun 12 14:26:38 2013
New Revision: 1492219

URL: http://svn.apache.org/r1492219
Log:
MAHOUT-1253: Add experiment tools for StreamingKMeans, part 1


Added:
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/src/conf/driver.classes.default.props

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1492219&r1=1492218&r2=1492219&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Wed Jun 12 14:26:38 2013
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 0.8 - unreleased
 
+  MAHOUT-1253: Add experiment tools for StreamingKMeans, part 1 (dfilimon)
+
   MAHOUT-884: Matrix Concatenate Utility (Lance Norskog, smarthi)
 
   MAHOUT-1250: Deprecate unused algorithms (ssc)

Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java?rev=1492219&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java Wed Jun 12 14:26:38 2013
@@ -0,0 +1,267 @@
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+public class ClusterQualitySummarizer {
+  private String outputFile;
+
+  private PrintWriter fileOut;
+
+  private String trainFile;
+  private String testFile;
+  private String centroidFile;
+  private String centroidCompareFile;
+  private boolean mahoutKMeansFormat;
+  private boolean mahoutKMeansFormatCompare;
+
+  private DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();
+
+  public void printSummaries(List<OnlineSummarizer> summarizers, String type) {
+    printSummaries(summarizers, type, fileOut);
+  }
+
+  public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
+    double maxDistance = 0;
+    for (int i = 0; i < summarizers.size(); ++i) {
+      OnlineSummarizer summarizer = summarizers.get(i);
+      if (summarizer.getCount() == 0) {
+        System.out.printf("Cluster %d is empty\n", i);
+        continue;
+      }
+      maxDistance = Math.max(maxDistance, summarizer.getMax());
+      System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
+      // If there is just one point in the cluster, quartiles cannot be estimated. We'll just assume all the quartiles
+      // equal the only value.
+      boolean moreThanOne = summarizer.getCount() > 1;
+      if (fileOut != null) {
+        fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
+            summarizer.getSD(),
+            summarizer.getQuartile(0),
+            moreThanOne ? summarizer.getQuartile(1) : summarizer.getQuartile(0),
+            moreThanOne ? summarizer.getQuartile(2) : summarizer.getQuartile(0),
+            moreThanOne ? summarizer.getQuartile(3) : summarizer.getQuartile(0),
+            summarizer.getQuartile(4), summarizer.getCount(), type);
+      }
+    }
+    System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
+  }
+
+  public void run(String[] args) {
+    if (!parseArgs(args)) {
+      return;
+    }
+
+    Configuration conf = new Configuration();
+    try {
+      OutputStreamWriter confDumpWriter = new OutputStreamWriter(System.out);
+      Configuration.dumpConfiguration(conf, confDumpWriter);
+      confDumpWriter.flush();
+
+      fileOut = new PrintWriter(new FileOutputStream(outputFile));
+      fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
+          + "distance.q4,count,is.train\n");
+
+      // Reading in the centroids (both pairs, if they exist).
+      List<Centroid> centroids;
+      List<Centroid> centroidsCompare = null;
+      if (mahoutKMeansFormat) {
+        SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
+            new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidFile), PathType.GLOB, conf);
+        centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
+      } else {
+        SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
+            new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidFile), PathType.GLOB, conf);
+        centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
+      }
+
+      if (centroidCompareFile != null) {
+        if (mahoutKMeansFormatCompare) {
+          SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
+              new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidCompareFile), PathType.GLOB, conf);
+          centroidsCompare = Lists.newArrayList(
+              IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
+        } else {
+          SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
+              new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidCompareFile), PathType.GLOB, conf);
+          centroidsCompare = Lists.newArrayList(
+              IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
+        }
+      }
+
+      // Reading in the "training" set.
+      SequenceFileDirValueIterable<VectorWritable> trainIterable =
+          new SequenceFileDirValueIterable<VectorWritable>(new Path(trainFile), PathType.GLOB, conf);
+      Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
+      Iterable<Vector> datapoints = trainDatapoints;
+
+      printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
+          new SquaredEuclideanDistanceMeasure()), "train");
+
+      // Also adding in the "test" set.
+      if (testFile != null) {
+        SequenceFileDirValueIterable<VectorWritable> testIterable =
+            new SequenceFileDirValueIterable<VectorWritable>(new Path(testFile), PathType.GLOB, conf);
+        Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
+
+        printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
+            new SquaredEuclideanDistanceMeasure()), "test");
+
+        datapoints = Iterables.concat(trainDatapoints, testDatapoints);
+      }
+
+      // At this point, all train/test CSVs have been written. We now compute quality metrics.
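+      // The Dunn index (higher is better) compares the smallest between-centroid distance to the
+      // largest within-cluster spread; the Davies-Bouldin index (lower is better) averages each
+      // cluster's worst ratio of within-cluster scatter to between-centroid separation.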
+      List<OnlineSummarizer> summaries =
+          ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
+      List<OnlineSummarizer> compareSummaries = null;
+      if (centroidsCompare != null) {
+        compareSummaries =
+            ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
+      }
+      System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
+      if (compareSummaries != null) {
+        System.out.printf(" Second: %f\n",
+            ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
+      } else {
+        System.out.printf("\n");
+      }
+      System.out.printf("[Davies-Bouldin Index] First: %f",
+          ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
+      if (compareSummaries != null) {
+        System.out.printf(" Second: %f\n",
+          ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
+      } else {
+        System.out.printf("\n");
+      }
+
+      if (outputFile != null) {
+        fileOut.close();
+      }
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+    }
+  }
+
+  private boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+        .withShortName("i")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+        .withDescription("where to get seq files with the vectors (training set)")
+        .create();
+
+    Option testInputFileOption = builder.withLongName("testInput")
+        .withShortName("itest")
+        .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
+        .withDescription("where to get seq files with the vectors (test set)")
+        .create();
+
+    Option centroidsFileOption = builder.withLongName("centroids")
+        .withShortName("c")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
+        .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
+        .create();
+
+    Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
+        .withShortName("cc")
+        .withRequired(false)
+        .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
+        .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or " +
+            "StreamingKMeansDriver)")
+        .create();
+
+    Option outputFileOption = builder.withLongName("output")
+        .withShortName("o")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+        .withDescription("where to dump the CSV file with the results")
+        .create();
+
+    Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
+        .withShortName("mkm")
+        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+        .create();
+
+    Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
+        .withShortName("mkmc")
+        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+        .create();
+
+    Group normalArgs = new GroupBuilder()
+        .withOption(help)
+        .withOption(inputFileOption)
+        .withOption(testInputFileOption)
+        .withOption(outputFileOption)
+        .withOption(centroidsFileOption)
+        .withOption(centroidsCompareFileOption)
+        .withOption(mahoutKMeansFormatOption)
+        .withOption(mahoutKMeansCompareFormatOption)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
+
+    CommandLine cmdLine = parser.parseAndHelp(args);
+    if (cmdLine == null) {
+      return false;
+    }
+
+    trainFile = (String) cmdLine.getValue(inputFileOption);
+    if (cmdLine.hasOption(testInputFileOption)) {
+      testFile = (String) cmdLine.getValue(testInputFileOption);
+    }
+    centroidFile = (String) cmdLine.getValue(centroidsFileOption);
+    if (cmdLine.hasOption(centroidsCompareFileOption)) {
+      centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
+    }
+    outputFile = (String) cmdLine.getValue(outputFileOption);
+    if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
+      mahoutKMeansFormat = true;
+    }
+    if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
+      mahoutKMeansFormatCompare = true;
+    }
+    return true;
+  }
+
+  public static void main(String[] args) {
+    new ClusterQualitySummarizer().run(args);
+  }
+}
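
For reference, a minimal sketch of driving the summarizer programmatically; it is equivalent
to running main() above, and the paths below are hypothetical:

    String[] args = {
        "--input", "vectors/train",       // SequenceFile(s) of VectorWritable (training set)
        "--centroids", "clusters/final",  // centroids from StreamingKMeansDriver
        "--output", "quality.csv"         // per-cluster distance summaries as CSV
    };
    new ClusterQualitySummarizer().run(args);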

Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java?rev=1492219&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java Wed Jun 12 14:26:38 2013
@@ -0,0 +1,60 @@
+package org.apache.mahout.clustering.streaming.tools;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public class IOUtils {
+  /**
+   * Converts CentroidWritable values in a sequence file into Centroids lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Centroid> with the converted vectors.
+   */
+  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
+      Iterable<CentroidWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
+      @Override
+      public Centroid apply(CentroidWritable input) {
+        Preconditions.checkNotNull(input);
+        return input.getCentroid().clone();
+      }
+    });
+  }
+
+  /**
+   * Converts ClusterWritable values in a sequence file into Centroids lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Centroid> with the converted vectors.
+   */
+  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
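+      // Number the emitted centroids sequentially in iteration order.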
+      int numClusters = 0;
+      @Override
+      public Centroid apply(ClusterWritable input) {
+        Preconditions.checkNotNull(input);
+        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
+            input.getValue().getTotalObservations());
+      }
+    });
+  }
+
+  /**
+   * Converts VectorWritable values in a sequence file into Vectors lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Vector> with the converted vectors.
+   */
+  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
+      @Override
+      public Vector apply(VectorWritable input) {
+        Preconditions.checkNotNull(input);
+        return input.get().clone();
+      }
+    });
+  }
+}
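
These helpers are lazy: nothing is read until the returned Iterable is consumed, and each
element is cloned so callers may mutate it safely. A minimal sketch of composing them with
the sequence-file iterables used elsewhere in this commit (the path is hypothetical):

    Configuration conf = new Configuration();
    Iterable<Vector> vectors = IOUtils.getVectorsFromVectorWritableIterable(
        new SequenceFileDirValueIterable<VectorWritable>(new Path("data/vectors"), PathType.GLOB, conf));
    for (Vector v : vectors) {
      System.out.println(v.zSum());  // e.g. print each vector's element sum
    }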

Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java?rev=1492219&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java Wed Jun 12 14:26:38 2013
@@ -0,0 +1,129 @@
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+
+public class ResplitSequenceFiles {
+  private String inputFile;
+  private String outputFileBase;
+  private int numSplits;
+
+  private Configuration conf;
+  private FileSystem fs;
+
+  private void writeSplit(Iterator<Pair<Writable, Writable>> inputIterator,
+                          int numSplit, int numEntriesPerSplit) throws IOException {
+    SequenceFile.Writer splitWriter = null;
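+    // Create the writer lazily, from the first entry, so its key/value classes match the input.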
+    for (int j = 0; j < numEntriesPerSplit; ++j) {
+      Pair<Writable, Writable> item = inputIterator.next();
+      if (splitWriter == null) {
+        splitWriter = SequenceFile.createWriter(fs, conf,
+            new Path(outputFileBase + "-" + numSplit), item.getFirst().getClass(), item.getSecond().getClass());
+      }
+      splitWriter.append(item.getFirst(), item.getSecond());
+    }
+    if (splitWriter != null) {
+      splitWriter.close();
+    }
+  }
+
+  private void run(PrintWriter printWriter) throws IOException {
+    conf = new Configuration();
+    SequenceFileDirIterable<Writable, Writable> inputIterable = new
+        SequenceFileDirIterable<Writable, Writable>(new Path(inputFile), PathType.LIST, conf);
+    fs = FileSystem.get(conf);
+
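+    // Sizing the splits takes one full counting pass over the input; the data is then read again to write the splits.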
+    int numEntries = Iterables.size(inputIterable);
+    int numEntriesPerSplit = numEntries / numSplits;
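+    // The last split absorbs the remainder, e.g. 10 entries in 3 splits gives sizes 3, 3 and 4.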
+    int numEntriesLastSplit = numEntries - numEntriesPerSplit * (numSplits - 1);
+    Iterator<Pair<Writable, Writable>> inputIterator = inputIterable.iterator();
+
+    printWriter.printf("Writing %d splits\n", numSplits);
+    for (int i = 0; i < numSplits - 1; ++i) {
+      printWriter.printf("Writing split %d\n", i);
+      writeSplit(inputIterator, i, numEntriesPerSplit);
+    }
+    printWriter.printf("Writing split %d\n", numSplits - 1);
+    writeSplit(inputIterator, numSplits - 1, numEntriesLastSplit);
+  }
+
+  private boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+        .withShortName("i")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+        .withDescription("the base folder of the sequence files (they must all have the same key/value types)")
+        .create();
+
+    Option outputFileOption = builder.withLongName("output")
+        .withShortName("o")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+        .withDescription("the base name for the output splits; the i'th split gets the suffix -i")
+        .create();
+
+    Option numSplitsOption = builder.withLongName("numSplits")
+        .withShortName("ns")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("numSplits").withMaximum(1).create())
+        .withDescription("how many splits to use for the given files")
+        .create();
+
+    Group normalArgs = new GroupBuilder()
+        .withOption(help)
+        .withOption(inputFileOption)
+        .withOption(outputFileOption)
+        .withOption(numSplitsOption)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = (String) cmdLine.getValue(inputFileOption);
+    outputFileBase = (String) cmdLine.getValue(outputFileOption);
+    numSplits = Integer.parseInt((String) cmdLine.getValue(numSplitsOption));
+    return true;
+  }
+
+  public static void main(String[] args) throws IOException {
+    ResplitSequenceFiles runner = new ResplitSequenceFiles();
+    if (runner.parseArgs(args)) {
+      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+    }
+  }
+}
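
As a usage sketch (hypothetical paths), splitting one directory of SequenceFiles into four
roughly equal parts; note that main() throws IOException:

    // Produces out/part-0 through out/part-3; the last part absorbs any remainder.
    ResplitSequenceFiles.main(new String[] {
        "--input", "in/seqfiles", "--output", "out/part", "--numSplits", "4"});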

Modified: mahout/trunk/src/conf/driver.classes.default.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.default.props?rev=1492219&r1=1492218&r2=1492219&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.default.props (original)
+++ mahout/trunk/src/conf/driver.classes.default.props Wed Jun 12 14:26:38 2013
@@ -15,6 +15,8 @@ org.apache.mahout.text.WikipediaToSequen
 org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
 org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index
 org.apache.mahout.utils.ConcatenateVectorsJob = concatmatrices : Concatenates 2 matrices of same cardinality into a single matrix
+org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a given number of roughly equal parts
+org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV
 
 #Math
 org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix