You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@mahout.apache.org by je...@apache.org on 2010/05/23 17:22:30 UTC

svn commit: r947427 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/ examples/src/main/java/org/ap...

Author: jeastman
Date: Sun May 23 15:22:28 2010
New Revision: 947427

URL: http://svn.apache.org/viewvc?rev=947427&view=rev
Log:
MAHOUT-294: fixed -k option as optional but added required=true for Dirichlet
MAHOUT-398: added minimal vector renaming to improve clarity
MAHOUT-397: fixes to allow setting -nr in vector output stages

Tests all ran before I installed Java update. Will test on EC2 again today.

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
    mahout/trunk/examples/bin/build-reuters.sh
    mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Sun May 23 15:22:28 2010
@@ -83,7 +83,7 @@ public class DirichletDriver {
     Option inputOpt = DefaultOptionCreator.inputOption().create();
     Option outputOpt = DefaultOptionCreator.outputOption().create();
     Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
-    Option kOpt = DefaultOptionCreator.kOption().create();
+    Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
     Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
     Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
     Option alphaOpt = DefaultOptionCreator.alphaOption().create();

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Sun May 23 15:22:28 2010
@@ -145,10 +145,10 @@ public final class LDADriver {
     double oldLL = Double.NEGATIVE_INFINITY;
     boolean converged = false;
 
-    for (int iteration = 0; ((maxIterations < 1) || (iteration < maxIterations)) && !converged; iteration++) {
+    for (int iteration = 1; ((maxIterations < 1) || (iteration <= maxIterations)) && !converged; iteration++) {
       log.info("Iteration {}", iteration);
       // point the output to a new directory per iteration
-      Path stateOut = new Path(output, "state-" + (iteration + 1));
+      Path stateOut = new Path(output, "state-" + iteration);
       double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, topicSmoothing, numReducers);
       double relChange = (oldLL - ll) / oldLL;
 
@@ -157,7 +157,7 @@ public final class LDADriver {
       log.info("(Old LL: {})", oldLL);
       log.info("(Rel Change: {})", relChange);
 
-      converged = (iteration > 2) && (relChange < OVERALL_CONVERGENCE);
+      converged = (iteration > 3) && (relChange < OVERALL_CONVERGENCE);
       stateIn = stateOut;
       oldLL = ll;
     }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java Sun May 23 15:22:28 2010
@@ -112,7 +112,7 @@ public final class DefaultOptionCreator 
    * Returns a default command line option for specification of numbers of clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
    */
   public static DefaultOptionBuilder kOption() {
-    return new DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
+    return new DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
         new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
         "The number of clusters to create").withShortName("k");
   }

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
@@ -38,14 +38,15 @@ fi
 
 cd ../..
 ./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
-./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
-./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
 
-# to use k-Means clustering, uncomment the next two lines
-#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
+# to use k-Means clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
 #./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
 
-# to use LDA clustering, uncomment the next two lines
-#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
-#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
+# to use LDA clustering, uncomment the next three lines
+#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
+#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
+#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
 

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Sun May 23 15:22:28 2010
@@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
     private final FileSystem fs;
     
     public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
-      if (chunkSizeInMB < 64) {
-        chunkSizeInMB = 64;
-      } else if (chunkSizeInMB > 1984) {
+      if (chunkSizeInMB > 1984) {
         chunkSizeInMB = 1984;
       }
       maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sun May 23 15:22:28 2010
@@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
       abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
+              + " (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
     Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
         .withDescription(
-          "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+          "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
         .withShortName("seq").create();
     
     Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("w").create();
+      "If set, overwrite the output directory").withShortName("ow").create();
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
     
@@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
       if (cmdLine.hasOption(numReduceTasksOpt)) {
         reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
       }
-      log.info("Pass1 reduce tasks: {}", reduceTasks);
+      log.info("Number of reduce tasks: {}", reduceTasks);
       
       Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
       if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
         TFIDFConverter.processTfIdf(
           new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
           new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent, norm,
-          sequentialAccessOutput);
+          sequentialAccessOutput, reduceTasks);
       }
     } catch (OptionException e) {
       log.error("Exception", e);

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Sun May 23 15:22:28 2010
@@ -69,13 +69,16 @@ public final class PartialVectorMerger {
    *          output directory were the partial vectors have to be created
    * @param normPower
    *          The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
+   * @param numReducers 
+   *          The number of reducers to spawn
    * @throws IOException
    */
   public static void mergePartialVectors(List<Path> partialVectorPaths,
                                          Path output,
                                          float normPower,
                                          int dimension,
-                                         boolean sequentialAccess) throws IOException {
+                                         boolean sequentialAccess, 
+                                         int numReducers) throws IOException {
     if (normPower != NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }
@@ -101,6 +104,7 @@ public final class PartialVectorMerger {
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setReducerClass(PartialVectorMergeReducer.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setNumReduceTasks(numReducers);
     
     HadoopUtil.overwriteOutput(output);
 

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sun May 23 15:22:28 2010
@@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
  */
 public final class DictionaryVectorizer {
   
-  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
   
   public static final String MIN_SUPPORT = "min.support";
   
@@ -153,7 +153,7 @@ public final class DictionaryVectorizer 
       Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
       makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
-        maxTermDimension[0], sequentialAccess);
+        maxTermDimension[0], sequentialAccess, numReducers);
     }
     
     Configuration conf = new Configuration();
@@ -162,7 +162,7 @@ public final class DictionaryVectorizer 
     Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
     if (dictionaryChunks.size() > 1) {
       PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
-        sequentialAccess);
+        sequentialAccess, numReducers);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -245,6 +245,8 @@ public final class DictionaryVectorizer 
    *          location of the chunk of features and the id's
    * @param output
    *          output directory were the partial vectors have to be created
+   * @param numReducers 
+   *          the desired number of reducer tasks
    * @throws IOException
    */
   private static void makePartialVectors(Path input,
@@ -252,7 +254,8 @@ public final class DictionaryVectorizer 
                                          Path dictionaryFilePath,
                                          Path output,
                                          int dimension,
-                                         boolean sequentialAccess) throws IOException {
+                                         boolean sequentialAccess, 
+                                         int numReducers) throws IOException {
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -279,6 +282,7 @@ public final class DictionaryVectorizer 
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setReducerClass(TFPartialVectorReducer.class);
     conf.setOutputFormat(SequenceFileOutputFormat.class);
+    conf.setNumReduceTasks(numReducers);
 
     HadoopUtil.overwriteOutput(output);
     

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Sun May 23 15:22:28 2010
@@ -66,7 +66,7 @@ public final class TFIDFConverter {
   
   public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
   
-  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
+  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
   
   private static final String FREQUENCY_FILE = "frequency.file-";
   
@@ -99,17 +99,21 @@ public final class TFIDFConverter {
    * @param output
    *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
    *          are generated
-   * @param minDf
-   *          The minimum document frequency. Default 1
-   * @param maxDFPercent
-   *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
-   *          Expressed as an integer between 0 and 100. Default 99
    * @param chunkSizeInMegabytes
    *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
    *          stage. Its recommended you calculated this based on the number of cores and the free memory
    *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
    *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
    *          partial vectors without thrashing the system due to increased swapping
+   * @param minDf
+   *          The minimum document frequency. Default 1
+   * @param maxDFPercent
+   *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
+   *          Expressed as an integer between 0 and 100. Default 99
+   * @param numReducers 
+   *          The number of reducers to spawn. This also affects the possible parallelism since each reducer
+   *          will typically produce a single output file containing tf-idf vectors for a subset of the
+   *          documents in the corpus.
    * @throws IOException
    */
   public static void processTfIdf(Path input,
@@ -118,7 +122,8 @@ public final class TFIDFConverter {
                                   int minDf,
                                   int maxDFPercent,
                                   float normPower,
-                                  boolean sequentialAccessOutput) throws IOException {
+                                  boolean sequentialAccessOutput, 
+                                  int numReducers) throws IOException {
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
     } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -158,7 +163,7 @@ public final class TFIDFConverter {
     Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
     if (dictionaryChunks.size() > 1) {
       PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
-        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
+        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Sun May 23 15:22:28 2010
@@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
     DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
       getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
     TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
-                                getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false);
+                                getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false, 1);
     
   }
 }

Re: svn commit: r947427 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/dirichlet/ core/src/main/java/org/apache/mahout/clustering/lda/ core/src/main/java/org/apache/mahout/common/commandline/ examples/bin/ examples/src/main/java/org/ap...

Posted by Jeff Eastman <jd...@windwardsolutions.com>.

Tests all run on Java 1.6.0_10

On 5/23/10 8:22 AM, jeastman@apache.org wrote:
> Author: jeastman
> Date: Sun May 23 15:22:28 2010
> New Revision: 947427
>
> URL: http://svn.apache.org/viewvc?rev=947427&view=rev
> Log:
> MAHOUT-294: fixed -k option as optional but added required=true for Dirichlet
> MAHOUT-398: added minimal vector renaming to improve clarity
> MAHOUT-397: fixes to allow setting -nr in vector output stages
>
> Tests all ran before I installed Java update. Will test on EC2 again today.
>
> Modified:
>      mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
>      mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
>      mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
>      mahout/trunk/examples/bin/build-reuters.sh
>      mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
>      mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
>      mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
>      mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
>      mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
>      mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
>
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Sun May 23 15:22:28 2010
> @@ -83,7 +83,7 @@ public class DirichletDriver {
>       Option inputOpt = DefaultOptionCreator.inputOption().create();
>       Option outputOpt = DefaultOptionCreator.outputOption().create();
>       Option maxIterOpt = DefaultOptionCreator.maxIterationsOption().create();
> -    Option kOpt = DefaultOptionCreator.kOption().create();
> +    Option kOpt = DefaultOptionCreator.kOption().withRequired(true).create();
>       Option overwriteOutput = DefaultOptionCreator.overwriteOption().create();
>       Option clusteringOpt = DefaultOptionCreator.clusteringOption().create();
>       Option alphaOpt = DefaultOptionCreator.alphaOption().create();
>
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Sun May 23 15:22:28 2010
> @@ -145,10 +145,10 @@ public final class LDADriver {
>       double oldLL = Double.NEGATIVE_INFINITY;
>       boolean converged = false;
>
> -    for (int iteration = 0; ((maxIterations < 1) || (iteration < maxIterations)) && !converged; iteration++) {
> +    for (int iteration = 1; ((maxIterations < 1) || (iteration <= maxIterations)) && !converged; iteration++) {
>         log.info("Iteration {}", iteration);
>         // point the output to a new directory per iteration
> -      Path stateOut = new Path(output, "state-" + (iteration + 1));
> +      Path stateOut = new Path(output, "state-" + iteration);
>         double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, topicSmoothing, numReducers);
>         double relChange = (oldLL - ll) / oldLL;
>
> @@ -157,7 +157,7 @@ public final class LDADriver {
>         log.info("(Old LL: {})", oldLL);
>         log.info("(Rel Change: {})", relChange);
>
> -      converged = (iteration > 2) && (relChange < OVERALL_CONVERGENCE);
> +      converged = (iteration > 3) && (relChange < OVERALL_CONVERGENCE);
>         stateIn = stateOut;
>         oldLL = ll;
>       }
>
> Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java (original)
> +++ mahout/trunk/core/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java Sun May 23 15:22:28 2010
> @@ -112,7 +112,7 @@ public final class DefaultOptionCreator
>      * Returns a default command line option for specification of numbers of clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
>      */
>     public static DefaultOptionBuilder kOption() {
> -    return new DefaultOptionBuilder().withLongName("k").withRequired(true).withArgument(
> +    return new DefaultOptionBuilder().withLongName("k").withRequired(false).withArgument(
>           new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1).create()).withDescription(
>           "The number of clusters to create").withShortName("k");
>     }
>
> Modified: mahout/trunk/examples/bin/build-reuters.sh
> URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/examples/bin/build-reuters.sh (original)
> +++ mahout/trunk/examples/bin/build-reuters.sh Sun May 23 15:22:28 2010
> @@ -38,14 +38,15 @@ fi
>
>   cd ../..
>   ./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
> -./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
> -./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
> +./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5
>
> -# to use k-Means clustering, uncomment the next two lines
> -#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
> +# to use k-Means clustering, uncomment the next three lines
> +#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
> +#./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 20 -ow
>   #./bin/mahout clusterdump -s examples/bin/work/reuters-kmeans/clusters-10 -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile -b 100 -n 20
>
> -# to use LDA clustering, uncomment the next two lines
> -#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow
> -#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-9 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
> +# to use LDA clustering, uncomment the next three lines
> +#./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -wt tf -seq -nr 3
> +#./bin/mahout lda -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20
> +#./bin/mahout ldatopics -i ./examples/bin/work/reuters-lda/state-20 -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 -dt sequencefile
>
>
> Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
> +++ mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Sun May 23 15:22:28 2010
> @@ -79,9 +79,7 @@ public final class SequenceFilesFromDire
>       private final FileSystem fs;
>
>       public ChunkedWriter(int chunkSizeInMB, String outputDir) throws IOException {
> -      if (chunkSizeInMB < 64) {
> -        chunkSizeInMB = 64;
> -      } else if (chunkSizeInMB > 1984) {
> +      if (chunkSizeInMB > 1984) {
>           chunkSizeInMB = 1984;
>         }
>         maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024;
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sun May 23 15:22:28 2010
> @@ -101,14 +101,14 @@ public final class SparseVectorsFromSequ
>         abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
>           .withDescription(
>             "(Optional) The maximum size of ngrams to create"
> -              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
> +              + " (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
>       Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
>           .withDescription(
> -          "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
> +          "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
>           .withShortName("seq").create();
>
>       Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
> -      "If set, overwrite the output directory").withShortName("w").create();
> +      "If set, overwrite the output directory").withShortName("ow").create();
>       Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
>           .create();
>
> @@ -165,7 +165,7 @@ public final class SparseVectorsFromSequ
>         if (cmdLine.hasOption(numReduceTasksOpt)) {
>           reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
>         }
> -      log.info("Pass1 reduce tasks: {}", reduceTasks);
> +      log.info("Number of reduce tasks: {}", reduceTasks);
>
>         Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
>         if (cmdLine.hasOption(analyzerNameOpt)) {
> @@ -224,7 +224,7 @@ public final class SparseVectorsFromSequ
>           TFIDFConverter.processTfIdf(
>             new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
>             new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent, norm,
> -          sequentialAccessOutput);
> +          sequentialAccessOutput, reduceTasks);
>         }
>       } catch (OptionException e) {
>         log.error("Exception", e);
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Sun May 23 15:22:28 2010
> @@ -69,13 +69,16 @@ public final class PartialVectorMerger {
>      *          output directory were the partial vectors have to be created
>      * @param normPower
>      *          The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
> +   * @param numReducers
> +   *          The number of reducers to spawn
>      * @throws IOException
>      */
>     public static void mergePartialVectors(List<Path> partialVectorPaths,
>                                            Path output,
>                                            float normPower,
>                                            int dimension,
> -                                         boolean sequentialAccess) throws IOException {
> +                                         boolean sequentialAccess,
> +                                         int numReducers) throws IOException {
>       if (normPower != NO_NORMALIZING && normPower < 0) {
>         throw new IllegalArgumentException("normPower must either be -1 or >= 0");
>       }
> @@ -101,6 +104,7 @@ public final class PartialVectorMerger {
>       conf.setInputFormat(SequenceFileInputFormat.class);
>       conf.setReducerClass(PartialVectorMergeReducer.class);
>       conf.setOutputFormat(SequenceFileOutputFormat.class);
> +    conf.setNumReduceTasks(numReducers);
>
>       HadoopUtil.overwriteOutput(output);
>
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sun May 23 15:22:28 2010
> @@ -59,7 +59,7 @@ import org.apache.mahout.utils.vectors.t
>    */
>   public final class DictionaryVectorizer {
>
> -  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
> +  public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
>
>     public static final String MIN_SUPPORT = "min.support";
>
> @@ -153,7 +153,7 @@ public final class DictionaryVectorizer
>         Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
>         partialVectorPaths.add(partialVectorOutputPath);
>         makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
> -        maxTermDimension[0], sequentialAccess);
> +        maxTermDimension[0], sequentialAccess, numReducers);
>       }
>
>       Configuration conf = new Configuration();
> @@ -162,7 +162,7 @@ public final class DictionaryVectorizer
>       Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
>       if (dictionaryChunks.size() > 1) {
>         PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
> -        sequentialAccess);
> +        sequentialAccess, numReducers);
>         HadoopUtil.deletePaths(partialVectorPaths, fs);
>       } else {
>         Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
> @@ -245,6 +245,8 @@ public final class DictionaryVectorizer
>      *          location of the chunk of features and the id's
>      * @param output
>      *          output directory were the partial vectors have to be created
> +   * @param numReducers
> +   *          the desired number of reducer tasks
>      * @throws IOException
>      */
>     private static void makePartialVectors(Path input,
> @@ -252,7 +254,8 @@ public final class DictionaryVectorizer
>                                            Path dictionaryFilePath,
>                                            Path output,
>                                            int dimension,
> -                                         boolean sequentialAccess) throws IOException {
> +                                         boolean sequentialAccess,
> +                                         int numReducers) throws IOException {
>
>       Configurable client = new JobClient();
>       JobConf conf = new JobConf(DictionaryVectorizer.class);
> @@ -279,6 +282,7 @@ public final class DictionaryVectorizer
>       conf.setInputFormat(SequenceFileInputFormat.class);
>       conf.setReducerClass(TFPartialVectorReducer.class);
>       conf.setOutputFormat(SequenceFileOutputFormat.class);
> +    conf.setNumReduceTasks(numReducers);
>
>       HadoopUtil.overwriteOutput(output);
>
>
> Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
> +++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Sun May 23 15:22:28 2010
> @@ -66,7 +66,7 @@ public final class TFIDFConverter {
>
>     public static final String TFIDF_OUTPUT_FOLDER = "tfidf";
>
> -  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "vectors";
> +  private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
>
>     private static final String FREQUENCY_FILE = "frequency.file-";
>
> @@ -99,17 +99,21 @@ public final class TFIDFConverter {
>      * @param output
>      *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
>      *          are generated
> -   * @param minDf
> -   *          The minimum document frequency. Default 1
> -   * @param maxDFPercent
> -   *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
> -   *          Expressed as an integer between 0 and 100. Default 99
>      * @param chunkSizeInMegabytes
>      *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
>      *          stage. Its recommended you calculated this based on the number of cores and the free memory
>      *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
>      *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
>      *          partial vectors without thrashing the system due to increased swapping
> +   * @param minDf
> +   *          The minimum document frequency. Default 1
> +   * @param maxDFPercent
> +   *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
> +   *          Expressed as an integer between 0 and 100. Default 99
> +   * @param numReducers
> +   *          The number of reducers to spawn. This also affects the possible parallelism since each reducer
> +   *          will typically produce a single output file containing tf-idf vectors for a subset of the
> +   *          documents in the corpus.
>      * @throws IOException
>      */
>     public static void processTfIdf(Path input,
> @@ -118,7 +122,8 @@ public final class TFIDFConverter {
>                                     int minDf,
>                                     int maxDFPercent,
>                                     float normPower,
> -                                  boolean sequentialAccessOutput) throws IOException {
> +                                  boolean sequentialAccessOutput,
> +                                  int numReducers) throws IOException {
>       if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
>         chunkSizeInMegabytes = MIN_CHUNKSIZE;
>       } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
> @@ -158,7 +163,7 @@ public final class TFIDFConverter {
>       Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
>       if (dictionaryChunks.size() > 1) {
>         PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
> -        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput);
> +        datasetFeatures.getFirst()[0].intValue(), sequentialAccessOutput, numReducers);
>         HadoopUtil.deletePaths(partialVectorPaths, fs);
>       } else {
>         Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
>
> Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
> URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=947427&r1=947426&r2=947427&view=diff
> ==============================================================================
> --- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
> +++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Sun May 23 15:22:28 2010
> @@ -119,7 +119,7 @@ public class DictionaryVectorizerTest ex
>       DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
>         getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
>       TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
> -                                getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false);
> +                                getTestTempDirPath("output/tfidf"), 100, 1, 99, 1.0f, false, 1);
>
>     }
>   }
>
>
>
>