You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by df...@apache.org on 2013/06/12 16:26:38 UTC
svn commit: r1492219 - in /mahout/trunk: ./
examples/src/main/java/org/apache/mahout/clustering/streaming/
examples/src/main/java/org/apache/mahout/clustering/streaming/tools/
src/conf/
Author: dfilimon
Date: Wed Jun 12 14:26:38 2013
New Revision: 1492219
URL: http://svn.apache.org/r1492219
Log:
MAHOUT-1253: Add experiment tools for StreamingKMeans, part 1
Added:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/src/conf/driver.classes.default.props
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1492219&r1=1492218&r2=1492219&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Wed Jun 12 14:26:38 2013
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.8 - unreleased
+__MAHOUT-1253: Add experiment tools for StreamingKMeans, part 1 (dfilimon)
+
MAHOUT-884: Matrix Concatenate Utility (Lance Norskog, smarthi)
MAHOUT-1250: Deprecate unused algorithms (ssc)
Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java?rev=1492219&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java Wed Jun 12 14:26:38 2013
@@ -0,0 +1,267 @@
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.List;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+/**
+ * Command-line tool that summarizes the quality of a clustering produced by
+ * StreamingKMeans or Mahout KMeans. For each cluster it prints distance
+ * statistics to stdout, writes one CSV row per cluster to the requested output
+ * file, and finally reports the Dunn and Davies-Bouldin indexes for the
+ * clustering (and for an optional second clustering to compare against).
+ */
+public class ClusterQualitySummarizer {
+  // Path of the CSV file the per-cluster statistics are written to (required option).
+  private String outputFile;
+
+  private PrintWriter fileOut;
+
+  private String trainFile;
+  private String testFile;
+  private String centroidFile;
+  private String centroidCompareFile;
+  private boolean mahoutKMeansFormat;
+  private boolean mahoutKMeansFormatCompare;
+
+  private DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();
+
+  public void printSummaries(List<OnlineSummarizer> summarizers, String type) {
+    printSummaries(summarizers, type, fileOut);
+  }
+
+  /**
+   * Prints per-cluster distance statistics to stdout and, when fileOut is not null,
+   * appends one CSV row per non-empty cluster.
+   *
+   * @param summarizers one OnlineSummarizer per cluster, indexed by cluster id
+   * @param type tag written in the "is.train" CSV column ("train" or "test")
+   * @param fileOut CSV destination; may be null to skip the CSV output
+   */
+  public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
+    double maxDistance = 0;
+    for (int i = 0; i < summarizers.size(); ++i) {
+      OnlineSummarizer summarizer = summarizers.get(i);
+      if (summarizer.getCount() == 0) {
+        System.out.printf("Cluster %d is empty\n", i);
+        continue;
+      }
+      maxDistance = Math.max(maxDistance, summarizer.getMax());
+      System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
+      // If there is just one point in the cluster, quartiles cannot be estimated. We'll just assume all the quartiles
+      // equal the only value.
+      boolean moreThanOne = summarizer.getCount() > 1;
+      if (fileOut != null) {
+        fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
+            summarizer.getSD(),
+            summarizer.getQuartile(0),
+            moreThanOne ? summarizer.getQuartile(1) : summarizer.getQuartile(0),
+            moreThanOne ? summarizer.getQuartile(2) : summarizer.getQuartile(0),
+            moreThanOne ? summarizer.getQuartile(3) : summarizer.getQuartile(0),
+            summarizer.getQuartile(4), summarizer.getCount(), type);
+      }
+    }
+    System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
+  }
+
+  public void run(String[] args) {
+    if (!parseArgs(args)) {
+      return;
+    }
+
+    Configuration conf = new Configuration();
+    try {
+      // Use an explicit charset so the output does not depend on the platform default.
+      Configuration.dumpConfiguration(conf, new OutputStreamWriter(System.out, Charsets.UTF_8));
+
+      fileOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), Charsets.UTF_8));
+      fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
+          + "distance.q4,count,is.train\n");
+
+      // Reading in the centroids (both pairs, if they exist).
+      List<Centroid> centroids;
+      List<Centroid> centroidsCompare = null;
+      if (mahoutKMeansFormat) {
+        SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
+            new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidFile), PathType.GLOB, conf);
+        centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
+      } else {
+        SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
+            new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidFile), PathType.GLOB, conf);
+        centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
+      }
+
+      if (centroidCompareFile != null) {
+        if (mahoutKMeansFormatCompare) {
+          SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
+              new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidCompareFile), PathType.GLOB, conf);
+          centroidsCompare = Lists.newArrayList(
+              IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
+        } else {
+          SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
+              new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidCompareFile), PathType.GLOB, conf);
+          centroidsCompare = Lists.newArrayList(
+              IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
+        }
+      }
+
+      // Reading in the "training" set.
+      SequenceFileDirValueIterable<VectorWritable> trainIterable =
+          new SequenceFileDirValueIterable<VectorWritable>(new Path(trainFile), PathType.GLOB, conf);
+      Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
+      Iterable<Vector> datapoints = trainDatapoints;
+
+      printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
+          new SquaredEuclideanDistanceMeasure()), "train");
+
+      // Also adding in the "test" set.
+      if (testFile != null) {
+        SequenceFileDirValueIterable<VectorWritable> testIterable =
+            new SequenceFileDirValueIterable<VectorWritable>(new Path(testFile), PathType.GLOB, conf);
+        Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
+
+        printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
+            new SquaredEuclideanDistanceMeasure()), "test");
+
+        datapoints = Iterables.concat(trainDatapoints, testDatapoints);
+      }
+
+      // At this point, all train/test CSVs have been written. We now compute quality metrics.
+      List<OnlineSummarizer> summaries =
+          ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
+      List<OnlineSummarizer> compareSummaries = null;
+      if (centroidsCompare != null) {
+        compareSummaries =
+            ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
+      }
+      System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
+      if (compareSummaries != null) {
+        System.out.printf(" Second: %f\n",
+            ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
+      } else {
+        System.out.printf("\n");
+      }
+      System.out.printf("[Davies-Bouldin Index] First: %f",
+          ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
+      if (compareSummaries != null) {
+        System.out.printf(" Second: %f\n",
+            ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
+      } else {
+        System.out.printf("\n");
+      }
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+    } finally {
+      // Close (and flush) the CSV writer even if an exception interrupted the run;
+      // previously it was only closed on the success path and leaked on IOException.
+      if (fileOut != null) {
+        fileOut.close();
+      }
+    }
+  }
+
+  /**
+   * Parses the command-line options into the instance fields.
+   *
+   * @return true if parsing succeeded and the tool should run; false otherwise
+   */
+  private boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+        .withShortName("i")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+        .withDescription("where to get seq files with the vectors (training set)")
+        .create();
+
+    Option testInputFileOption = builder.withLongName("testInput")
+        .withShortName("itest")
+        .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
+        .withDescription("where to get seq files with the vectors (test set)")
+        .create();
+
+    Option centroidsFileOption = builder.withLongName("centroids")
+        .withShortName("c")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
+        .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
+        .create();
+
+    Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
+        .withShortName("cc")
+        .withRequired(false)
+        .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
+        .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or " +
+            "StreamingKMeansDriver)")
+        .create();
+
+    Option outputFileOption = builder.withLongName("output")
+        .withShortName("o")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+        .withDescription("where to dump the CSV file with the results")
+        .create();
+
+    Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
+        .withShortName("mkm")
+        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
+        .create();
+
+    Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
+        .withShortName("mkmc")
+        .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+        .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
+        .create();
+
+    Group normalArgs = new GroupBuilder()
+        .withOption(help)
+        .withOption(inputFileOption)
+        .withOption(testInputFileOption)
+        .withOption(outputFileOption)
+        .withOption(centroidsFileOption)
+        .withOption(centroidsCompareFileOption)
+        .withOption(mahoutKMeansFormatOption)
+        .withOption(mahoutKMeansCompareFormatOption)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
+
+    CommandLine cmdLine = parser.parseAndHelp(args);
+    if (cmdLine == null) {
+      return false;
+    }
+
+    trainFile = (String) cmdLine.getValue(inputFileOption);
+    if (cmdLine.hasOption(testInputFileOption)) {
+      testFile = (String) cmdLine.getValue(testInputFileOption);
+    }
+    centroidFile = (String) cmdLine.getValue(centroidsFileOption);
+    if (cmdLine.hasOption(centroidsCompareFileOption)) {
+      centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
+    }
+    outputFile = (String) cmdLine.getValue(outputFileOption);
+    if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
+      mahoutKMeansFormat = true;
+    }
+    if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
+      mahoutKMeansFormatCompare = true;
+    }
+    return true;
+  }
+
+  public static void main(String[] args) {
+    new ClusterQualitySummarizer().run(args);
+  }
+}
Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java?rev=1492219&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java Wed Jun 12 14:26:38 2013
@@ -0,0 +1,60 @@
+package org.apache.mahout.clustering.streaming.tools;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Static helpers for lazily converting sequence-file Writable values into
+ * Mahout math types (Centroid / Vector).
+ */
+public class IOUtils {
+  // Utility class with only static methods; prevent instantiation.
+  private IOUtils() {
+  }
+
+  /**
+   * Converts CentroidWritable values in a sequence file into Centroids lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Centroid> with the converted vectors.
+   */
+  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
+      Iterable<CentroidWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
+      @Override
+      public Centroid apply(CentroidWritable input) {
+        Preconditions.checkNotNull(input);
+        // Clone: the iterator may reuse the same writable instance for each record.
+        return input.getCentroid().clone();
+      }
+    });
+  }
+
+  /**
+   * Converts ClusterWritable values in a sequence file into Centroids lazily.
+   * The centroids are numbered sequentially in iteration order and weighted by
+   * the cluster's total number of observations.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Centroid> with the converted vectors.
+   */
+  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
+      int numClusters = 0;
+      @Override
+      public Centroid apply(ClusterWritable input) {
+        Preconditions.checkNotNull(input);
+        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
+            input.getValue().getTotalObservations());
+      }
+    });
+  }
+
+  /**
+   * Converts VectorWritable values in a sequence file into Vectors lazily.
+   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
+   * @return an Iterable<Vector> with the converted vectors.
+   */
+  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
+    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
+      @Override
+      public Vector apply(VectorWritable input) {
+        Preconditions.checkNotNull(input);
+        return input.get().clone();
+      }
+    });
+  }
+}
Added: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java?rev=1492219&view=auto
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java (added)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java Wed Jun 12 14:26:38 2013
@@ -0,0 +1,129 @@
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+
+/**
+ * Command-line tool that re-splits a directory of sequence files (all sharing
+ * the same key/value types) into a fixed number of roughly equal splits named
+ * outputFileBase-0, outputFileBase-1, ... The last split absorbs the remainder.
+ */
+public class ResplitSequenceFiles {
+  private String inputFile;
+  private String outputFileBase;
+  private int numSplits;
+
+  private Configuration conf;
+  private FileSystem fs;
+
+  /**
+   * Writes the next numEntriesPerSplit entries from inputIterator into the
+   * sequence file outputFileBase-numSplit. The writer is created lazily from
+   * the first entry's key/value classes; no file is created for an empty split.
+   *
+   * @throws IOException if writing the split fails
+   */
+  private void writeSplit(Iterator<Pair<Writable, Writable>> inputIterator,
+                          int numSplit, int numEntriesPerSplit) throws IOException {
+    SequenceFile.Writer splitWriter = null;
+    try {
+      for (int j = 0; j < numEntriesPerSplit; ++j) {
+        Pair<Writable, Writable> item = inputIterator.next();
+        if (splitWriter == null) {
+          splitWriter = SequenceFile.createWriter(fs, conf,
+              new Path(outputFileBase + "-" + numSplit), item.getFirst().getClass(), item.getSecond().getClass());
+        }
+        splitWriter.append(item.getFirst(), item.getSecond());
+      }
+    } finally {
+      // Close even if append/next threw; previously the writer leaked on failure.
+      if (splitWriter != null) {
+        splitWriter.close();
+      }
+    }
+  }
+
+  /**
+   * Counts the input entries, then writes numSplits - 1 splits of equal size
+   * followed by a final split that also receives the remainder entries.
+   *
+   * @param printWriter destination for progress messages
+   * @throws IOException if reading the input or writing a split fails
+   */
+  private void run(PrintWriter printWriter) throws IOException {
+    conf = new Configuration();
+    SequenceFileDirIterable<Writable, Writable> inputIterable = new
+        SequenceFileDirIterable<Writable, Writable>(new Path(inputFile), PathType.LIST, conf);
+    fs = FileSystem.get(conf);
+
+    int numEntries = Iterables.size(inputIterable);
+    int numEntriesPerSplit = numEntries / numSplits;
+    // The last split gets the per-split share plus whatever the integer division dropped.
+    int numEntriesLastSplit = numEntriesPerSplit + numEntries - numEntriesPerSplit * numSplits;
+    Iterator<Pair<Writable, Writable>> inputIterator = inputIterable.iterator();
+
+    printWriter.printf("Writing %d splits\n", numSplits);
+    for (int i = 0; i < numSplits - 1; ++i) {
+      printWriter.printf("Writing split %d\n", i);
+      writeSplit(inputIterator, i, numEntriesPerSplit);
+    }
+    printWriter.printf("Writing split %d\n", numSplits - 1);
+    writeSplit(inputIterator, numSplits - 1, numEntriesLastSplit);
+  }
+
+  /**
+   * Parses the command-line options into the instance fields.
+   *
+   * @return true if parsing succeeded and the tool should run; false otherwise
+   */
+  private boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help").withDescription("print this list").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder.withLongName("input")
+        .withShortName("i")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+        .withDescription("what the base folder for sequence files is (they all must have the same key/value type)")
+        .create();
+
+    Option outputFileOption = builder.withLongName("output")
+        .withShortName("o")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+        .withDescription("the base name of the file split that the files will be split into; the i'th split has the " +
+            "suffix -i")
+        .create();
+
+    Option numSplitsOption = builder.withLongName("numSplits")
+        .withShortName("ns")
+        .withRequired(true)
+        .withArgument(argumentBuilder.withName("numSplits").withMaximum(1).create())
+        .withDescription("how many splits to use for the given files")
+        .create();
+
+    Group normalArgs = new GroupBuilder()
+        .withOption(help)
+        .withOption(inputFileOption)
+        .withOption(outputFileOption)
+        .withOption(numSplitsOption)
+        .create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = (String) cmdLine.getValue(inputFileOption);
+    outputFileBase = (String) cmdLine.getValue(outputFileOption);
+    numSplits = Integer.parseInt((String) cmdLine.getValue(numSplitsOption));
+    if (numSplits < 1) {
+      // Guard against the division by zero (numEntries / numSplits) in run().
+      System.err.printf("Invalid number of splits %d; must be at least 1\n", numSplits);
+      return false;
+    }
+    return true;
+  }
+
+  public static void main(String[] args) throws IOException {
+    ResplitSequenceFiles runner = new ResplitSequenceFiles();
+    if (runner.parseArgs(args)) {
+      runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+    }
+  }
+}
Modified: mahout/trunk/src/conf/driver.classes.default.props
URL: http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.default.props?rev=1492219&r1=1492218&r2=1492219&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.default.props (original)
+++ mahout/trunk/src/conf/driver.classes.default.props Wed Jun 12 14:26:38 2013
@@ -15,6 +15,8 @@ org.apache.mahout.text.WikipediaToSequen
org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives
org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index
org.apache.mahout.utils.ConcatenateVectorsJob = concatmatrices : Concatenates 2 matrices of same cardinality into a single matrix
+org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a number of equal splits
+org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV
#Math
org.apache.mahout.math.hadoop.TransposeJob = transpose : Take the transpose of a matrix