Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 18:56:01 UTC

svn commit: r909861 [3/4] - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/clustering/lda/ main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/ main/java/org/apache/mahout/utils/clustering/ main/java/org/apache/mahout/ut...

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Sat Feb 13 17:55:56 2010
@@ -17,6 +17,14 @@
 
 package org.apache.mahout.utils.vectors.lucene;
 
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -46,87 +54,83 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.nio.charset.Charset;
-
 public class Driver {
   private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
-  private Driver() {
-  }
-
+  
+  private Driver() {}
+  
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
+    
     Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
-            abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The Lucene directory").withShortName("d").create();
-
+      abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Lucene directory").withShortName("d").create();
+    
     Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output file").withShortName("o").create();
-
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
+        .withShortName("o").create();
+    
     Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
-            abuilder.withName("field").withMinimum(1).withMaximum(1).create()).
-            withDescription("The field in the index").withShortName("f").create();
-
+      abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The field in the index").withShortName("f").create();
+    
     Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
-            abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
-            withDescription("The field in the index containing the index.  If null, then the Lucene internal doc " +
-                    "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-
+      abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The field in the index containing the index.  If null, then the Lucene internal doc "
+          + "id is used which is prone to error if the underlying index changes").withShortName("i").create();
+    
     Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
-            abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output of the dictionary").withShortName("t").create();
-
+      abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The output of the dictionary").withShortName("t").create();
+    
     Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
-            abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).
-            withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-
+      abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
+    
     Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
-            abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
-            withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
+      abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The delimiter for outputing the dictionary").withShortName("l").create();
     Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
-            abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).
-            withDescription("The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  " +
-                    "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
+      abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  "
+          + "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
     Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
-            abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
-            withDescription("The maximum number of vectors to output.  If not specified, then it will loop over all docs").withShortName("m").create();
-
+      abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
+        .withShortName("m").create();
+    
     Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument(
-            abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).
-            withDescription("The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)").withShortName("e").create();
+      abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The VectorWriter to use, either seq "
+          + "(SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)")
+        .withShortName("e").create();
     Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
-            abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).
-            withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();
+      abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The minimum document frequency.  Default is 1").withShortName("md").create();
     Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-            abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).
-            withDescription("The max percentage of docs for the DF.  Can be used to remove really high frequency terms.  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
-    Option helpOpt = obuilder.withLongName("help").
-            withDescription("Print out help").withShortName("h").create();
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
-            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
-            .withOption(weightOpt).withOption(minDFOpt).create();
+      abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
+          + "  Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
+      outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
+        .withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
+        .withOption(weightOpt).withOption(minDFOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
-
+        
         CommandLineUtil.printHelp(group);
         return;
       }
-      //Springify all this
-      if (cmdLine.hasOption(inputOpt)) {//Lucene case
+      // Springify all this
+      if (cmdLine.hasOption(inputOpt)) {// Lucene case
         File file = new File(cmdLine.getValue(inputOpt).toString());
         if (file.exists() && file.isDirectory()) {
           long maxDocs = Long.MAX_VALUE;
@@ -182,8 +186,8 @@
             iterable = new LuceneIterable(reader, idField, field, mapper, norm);
           }
           String outFile = cmdLine.getValue(outputOpt).toString();
-          log.info("Output File: {}", outFile);
-
+          Driver.log.info("Output File: {}", outFile);
+          
           VectorWriter vectorWriter;
           if (cmdLine.hasOption(outWriterOpt)) {
             String outWriter = cmdLine.getValue(outWriterOpt).toString();
@@ -191,42 +195,44 @@
               BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
               vectorWriter = new JWriterVectorWriter(writer);
             } else {
-              vectorWriter = getSeqFileWriter(outFile);
+              vectorWriter = Driver.getSeqFileWriter(outFile);
             }
           } else {
-            vectorWriter = getSeqFileWriter(outFile);
+            vectorWriter = Driver.getSeqFileWriter(outFile);
           }
-
+          
           long numDocs = vectorWriter.write(iterable, maxDocs);
           vectorWriter.close();
-          log.info("Wrote: {} vectors", numDocs);
-
-          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+          Driver.log.info("Wrote: {} vectors", numDocs);
+          
+          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString()
+              : "\t";
           File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
-          log.info("Dictionary Output file: {}", dictOutFile);
-          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+          Driver.log.info("Dictionary Output file: {}", dictOutFile);
+          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+              new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
           JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
           tiWriter.write(termInfo);
           tiWriter.close();
           writer.close();
         }
       }
-
+      
     } catch (OptionException e) {
-      log.error("Exception", e);
+      Driver.log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-
+  
   private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
     Path path = new Path(outFile);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
-    //TODO: Make this parameter driven
-    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, VectorWritable.class);
-
+    // TODO: Make this parameter driven
+    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
+      VectorWritable.class);
+    
     return new SequenceFileVectorWriter(seqWriter);
   }
-
-
+  
 }
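
A minimal sketch (not part of this commit) of driving the option parser above programmatically; every path, field name and option value is a hypothetical placeholder, and the index must have term vectors stored for the chosen field:

import java.io.IOException;

import org.apache.mahout.utils.vectors.lucene.Driver;

public final class DriverExample {
  public static void main(String[] args) throws IOException {
    // All paths and field names are placeholders.
    Driver.main(new String[] {
        "--dir", "/path/to/lucene/index",        // -d  Lucene directory
        "--idField", "id",                       // -i  stored id field
        "--field", "body",                       // -f  field to vectorize
        "--output", "/tmp/vectors/vectors.seq",  // -o  vector output file
        "--dictOut", "/tmp/vectors/dict.txt",    // -t  dictionary output
        "--weight", "TFIDF",                     // -w  TF or TFIDF
        "--norm", "2",                           // -n  normalize with the 2-norm
        "--max", "10000"                         // -m  cap on vectors written
    });
  }
}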

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Sat Feb 13 17:55:56 2010
@@ -17,36 +17,36 @@
 
 package org.apache.mahout.utils.vectors.lucene;
 
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.document.SetBasedFieldSelector;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.TermDocs;
 import org.apache.mahout.math.Vector;
 
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Iterator;
-
 /**
  * A LuceneIterable is an Iterable<Vector> that uses a Lucene index as the source for creating the {@link Vector}.
  * The Field used to create the Vector currently must have Term Vectors stored for it.
  */
 public class LuceneIterable implements Iterable<Vector> {
-
-  private IndexReader indexReader;
-  private String field;
-  private String idField;
-  private FieldSelector idFieldSelector;
-
-  private VectorMapper mapper;
-  private double normPower = NO_NORMALIZING;
-
+  
+  private final IndexReader indexReader;
+  private final String field;
+  private final String idField;
+  private final FieldSelector idFieldSelector;
+  
+  private final VectorMapper mapper;
+  private double normPower = LuceneIterable.NO_NORMALIZING;
+  
   public static final double NO_NORMALIZING = -1.0;
-
+  
   public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper) {
-    this(reader, idField, field, mapper, NO_NORMALIZING);
+    this(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING);
   }
-
+  
   /**
    * Produce a LuceneIterable that can create the Vector plus normalize it.
    *
@@ -57,7 +57,7 @@
    * @param normPower The normalization value.  Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
    */
   public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
-    if (normPower != NO_NORMALIZING && normPower < 0) {
+    if (normPower != LuceneIterable.NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }
     idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
@@ -67,8 +67,8 @@
     this.mapper = mapper;
     this.normPower = normPower;
   }
-
-
+  
+  
   @Override
   public Iterator<Vector> iterator() {
     try {
@@ -77,25 +77,25 @@
       throw new IllegalStateException(e);
     }
   }
-
+  
   private class TDIterator implements Iterator<Vector> {
     private final TermDocs termDocs;
-
+    
     private TDIterator() throws IOException {
       //term docs(null) is a better way of iterating all the docs in Lucene
       this.termDocs = indexReader.termDocs(null);
     }
-
+    
     @Override
     public boolean hasNext() {
-      // TODO this doesn't work with the Iterator contract -- hasNext() cannot have a side effect      
+      // TODO this doesn't work with the Iterator contract -- hasNext() cannot have a side effect
       try {
         return termDocs.next();
       } catch (IOException e) {
         throw new IllegalStateException(e);
       }
     }
-
+    
     @Override
     public Vector next() {
       Vector result;
@@ -114,24 +114,24 @@
         } else {
           result.setName(String.valueOf(doc));
         }
-        if (normPower != NO_NORMALIZING) {
+        if (normPower != LuceneIterable.NO_NORMALIZING) {
           result = result.normalize(normPower);
         }
       } catch (IOException e) {
         //Log?
         throw new IllegalStateException(e);
       }
-
+      
       return result;
     }
-
-
+    
+    
     @Override
     public void remove() {
       throw new UnsupportedOperationException();
     }
-
+    
   }
-
-
+  
+  
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java Sat Feb 13 17:55:56 2010
@@ -21,46 +21,46 @@
 import org.apache.lucene.index.TermVectorOffsetInfo;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.Weight;
 import org.apache.mahout.utils.vectors.TermEntry;
 import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.utils.vectors.Weight;
 
 
 /**
  * Not thread-safe
  */
 public class TFDFMapper extends VectorMapper {
-
+  
   //public static final int DEFAULT_CACHE_SIZE = 256;
-
+  
   //private final IndexReader reader; // TODO never used?
   private Vector vector;
-
+  
   private final Weight weight;
   private int numTerms;
   private final TermInfo termInfo;
   private String field;
   private final int numDocs;
-
+  
   public TFDFMapper(IndexReader reader, Weight weight, TermInfo termInfo) {
     //this.reader = reader;
     this.weight = weight;
     this.termInfo = termInfo;
     this.numDocs = reader.numDocs();
   }
-
+  
   @Override
   public Vector getVector() {
     return vector;
   }
-
+  
   @Override
   public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
     this.field = field;
     vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
     this.numTerms = numTerms;
   }
-
+  
   @Override
   public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
     TermEntry entry = termInfo.getTermEntry(field, term);
@@ -68,12 +68,12 @@
       vector.setQuick(entry.termIdx, weight.calculate(frequency, entry.docFreq, numTerms, numDocs));
     }
   }
-
+  
   @Override
   public boolean isIgnoringPositions() {
     return true;
   }
-
+  
   @Override
   public boolean isIgnoringOffsets() {
     return true;
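
LuceneIterable and TFDFMapper above together turn a Lucene index into a stream of Mahout vectors. A minimal sketch, assuming a Weight and TermInfo implementation are already built (as Driver does before this point); the "id" and "body" field names and the 2-norm are illustrative only:

import org.apache.lucene.index.IndexReader;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.utils.vectors.Weight;
import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
import org.apache.mahout.utils.vectors.lucene.VectorMapper;

public final class LuceneVectorDump {
  private LuceneVectorDump() {}

  // reader, weight and termInfo are assumed to be constructed elsewhere,
  // roughly the way Driver builds them.
  public static void dump(IndexReader reader, Weight weight, TermInfo termInfo) {
    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
    Iterable<Vector> vectors = new LuceneIterable(reader, "id", "body", mapper, 2.0);
    for (Vector vector : vectors) {
      System.out.println(vector); // one vector per document, named by the id field
    }
  }
}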

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sat Feb 13 17:55:56 2010
@@ -51,11 +51,10 @@
 import org.apache.mahout.utils.vectors.text.term.TermCountReducer;
 
 /**
- * This class converts a set of input documents in the sequence file format to
- * vectors. The Sequence file input should have a {@link Text} key containing
- * the unique document identifier and a {@link StringTuple} value containing the
- * tokenized document. You may use {@link DocumentProcessor} to tokenize the
- * document. This is a dictionary based Vectorizer.
+ * This class converts a set of input documents in the sequence file format to vectors. The Sequence file
+ * input should have a {@link Text} key containing the unique document identifier and a {@link StringTuple}
+ * value containing the tokenized document. You may use {@link DocumentProcessor} to tokenize the document.
+ * This is a dictionary based Vectorizer.
  * 
  */
 public final class DictionaryVectorizer {
@@ -91,33 +90,28 @@
   }
   
   /**
-   * Create Term Frequency (Tf) Vectors from the input set of documents in
-   * {@link SequenceFile} format. This tries to fix the maximum memory used by
-   * the feature chunk per node thereby splitting the process across multiple
-   * map/reduces.
+   * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
+   * tries to fix the maximum memory used by the feature chunk per node thereby splitting the process across
+   * multiple map/reduces.
    * 
    * @param input
    *          input directory of the documents in {@link SequenceFile} format
    * @param output
-   *          output directory where
-   *          {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the
-   *          document are generated
+   *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
+   *          are generated
    * @param minSupport
-   *          the minimum frequency of the feature in the entire corpus to be
-   *          considered for inclusion in the sparse vector
+   *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
+   *          sparse vector
    * @param maxNGramSize
-   *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and
-   *          trigram
+   *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
    * @param minLLRValue
    *          minValue of log likelihood ratio to used to prune ngrams
    * @param chunkSizeInMegabytes
-   *          the size in MB of the feature => id chunk to be kept in memory at
-   *          each node during Map/Reduce stage. Its recommended you calculated
-   *          this based on the number of cores and the free memory available to
-   *          you per node. Say, you have 2 cores and around 1GB extra memory to
-   *          spare we recommend you use a split size of around 400-500MB so
-   *          that two simultaneous reducers can create partial vectors without
-   *          thrashing the system due to increased swapping
+   *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
+   *          stage. Its recommended you calculated this based on the number of cores and the free memory
+   *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
+   *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
+   *          partial vectors without thrashing the system due to increased swapping
    * @throws IOException
    */
   public static void createTermFrequencyVectors(String input,
@@ -128,54 +122,49 @@
                                                 int numReducers,
                                                 int chunkSizeInMegabytes,
                                                 boolean sequentialAccess) throws IOException {
-    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
-      chunkSizeInMegabytes = MIN_CHUNKSIZE;
-    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
-      chunkSizeInMegabytes = MAX_CHUNKSIZE;
+    if (chunkSizeInMegabytes < DictionaryVectorizer.MIN_CHUNKSIZE) {
+      chunkSizeInMegabytes = DictionaryVectorizer.MIN_CHUNKSIZE;
+    } else if (chunkSizeInMegabytes > DictionaryVectorizer.MAX_CHUNKSIZE) { // 10GB
+      chunkSizeInMegabytes = DictionaryVectorizer.MAX_CHUNKSIZE;
+    }
+    if (minSupport < 0) {
+      minSupport = DictionaryVectorizer.DEFAULT_MIN_SUPPORT;
     }
-    if (minSupport < 0) minSupport = DEFAULT_MIN_SUPPORT;
     
     Path inputPath = new Path(input);
-    Path dictionaryJobPath = new Path(output + DICTIONARY_JOB_FOLDER);
-
+    Path dictionaryJobPath = new Path(output + DictionaryVectorizer.DICTIONARY_JOB_FOLDER);
+    
     int[] maxTermDimension = new int[1];
     List<Path> dictionaryChunks;
     if (maxNGramSize == 1) {
-      startWordCounting(inputPath, dictionaryJobPath, minSupport);
-      dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath,
-        output, chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
+      DictionaryVectorizer.startWordCounting(inputPath, dictionaryJobPath, minSupport);
+      dictionaryChunks = DictionaryVectorizer.createDictionaryChunks(minSupport, dictionaryJobPath, output,
+        chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
     } else {
-      CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath
-          .toString(), maxNGramSize, minSupport, minLLRValue, numReducers);
-      dictionaryChunks = createDictionaryChunks(minSupport, new Path(
-          output + DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
+      CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath.toString(), maxNGramSize,
+        minSupport, minLLRValue, numReducers);
+      dictionaryChunks = DictionaryVectorizer.createDictionaryChunks(minSupport, new Path(
+          output + DictionaryVectorizer.DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
         chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
     }
     
     int partialVectorIndex = 0;
     List<Path> partialVectorPaths = new ArrayList<Path>();
     for (Path dictionaryChunk : dictionaryChunks) {
-      Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
-        partialVectorIndex++);
+      Path partialVectorOutputPath = DictionaryVectorizer.getPath(
+        output + DictionaryVectorizer.VECTOR_OUTPUT_FOLDER, partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
-      makePartialVectors(input,
-                         maxNGramSize,
-                         dictionaryChunk,
-                         partialVectorOutputPath,
-                         maxTermDimension[0],
-                         sequentialAccess);
+      DictionaryVectorizer.makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
+        maxTermDimension[0], sequentialAccess);
     }
     
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);
     
-    String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
+    String outputDir = output + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
     if (dictionaryChunks.size() > 1) {
-      PartialVectorMerger.mergePartialVectors(partialVectorPaths,
-                                              outputDir,
-                                              -1,
-                                              maxTermDimension[0],
-                                              sequentialAccess);
+      PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
+        sequentialAccess);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -185,9 +174,8 @@
   }
   
   /**
-   * Read the feature frequency List which is built at the end of the Word Count
-   * Job and assign ids to them. This will use constant memory and will run at
-   * the speed of your disk read
+   * Read the feature frequency List which is built at the end of the Word Count Job and assign ids to them.
+   * This will use constant memory and will run at the speed of your disk read
    * 
    * @param minSupport
    * @param wordCountPath
@@ -207,15 +195,16 @@
     
     FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
     FileStatus[] outputFiles = fs.globStatus(new Path(wordCountPath.toString()
-                                                      + OUTPUT_FILES_PATTERN));
+                                                      + DictionaryVectorizer.OUTPUT_FILES_PATTERN));
     
     long chunkSizeLimit = chunkSizeInMegabytes * 1024 * 1024;
     int chunkIndex = 0;
-    Path chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
+    Path chunkPath = DictionaryVectorizer.getPath(dictionaryPathBase + DictionaryVectorizer.DICTIONARY_FILE,
+      chunkIndex);
     chunkPaths.add(chunkPath);
     
-    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf,
-        chunkPath, Text.class, IntWritable.class);
+    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
+        IntWritable.class);
     
     long currentChunkSize = 0;
     
@@ -229,21 +218,21 @@
           dictWriter.close();
           chunkIndex++;
           
-          chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
+          chunkPath = DictionaryVectorizer.getPath(dictionaryPathBase + DictionaryVectorizer.DICTIONARY_FILE,
+            chunkIndex);
           chunkPaths.add(chunkPath);
           
-          dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
-              IntWritable.class);
+          dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
           currentChunkSize = 0;
         }
         
-        int fieldSize = DICTIONARY_BYTE_OVERHEAD
-                        + (key.toString().length() * 2) + (Integer.SIZE / 8);
+        int fieldSize = DictionaryVectorizer.DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2
+                        + Integer.SIZE / 8;
         currentChunkSize += fieldSize;
         dictWriter.append(key, new IntWritable(i++));
       }
     }
-    maxTermDimension[0] = (int)i;
+    maxTermDimension[0] = i;
     dictWriter.close();
     
     return chunkPaths;
@@ -254,8 +243,8 @@
   }
   
   /**
-   * Create a partial vector using a chunk of features from the input documents.
-   * The input documents has to be in the {@link SequenceFile} format
+   * Create a partial vector using a chunk of features from the input documents. The input documents has to be
+   * in the {@link SequenceFile} format
    * 
    * @param input
    *          input directory of the documents in {@link SequenceFile} format
@@ -276,18 +265,16 @@
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(DictionaryVectorizer.class);
-    conf.set("io.serializations",
-      "org.apache.hadoop.io.serializer.JavaSerialization,"
-          + "org.apache.hadoop.io.serializer.WritableSerialization");
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
     // this conf parameter needs to be set enable serialisation of conf values
     
-    conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: "
-                    + input + ", dictionary-file: "
-                    + dictionaryFilePath.toString());
+    conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
+                    + ", dictionary-file: " + dictionaryFilePath.toString());
     conf.setInt(PartialVectorMerger.DIMENSION, dimension);
     conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
-    conf.setInt(MAX_NGRAMS, maxNGramSize);
-
+    conf.setInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
+    
     conf.setMapOutputKeyClass(Text.class);
     conf.setMapOutputValueClass(StringTuple.class);
     conf.setOutputKeyClass(Text.class);
@@ -311,21 +298,19 @@
   }
   
   /**
-   * Count the frequencies of words in parallel using Map/Reduce. The input
-   * documents have to be in {@link SequenceFile} format
+   * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
+   * {@link SequenceFile} format
    */
   private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(DictionaryVectorizer.class);
-    conf.set("io.serializations",
-      "org.apache.hadoop.io.serializer.JavaSerialization,"
-          + "org.apache.hadoop.io.serializer.WritableSerialization");
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
     // this conf parameter needs to be set enable serialisation of conf values
     
-    conf.setJobName("DictionaryVectorizer::WordCount: input-folder: "
-                    + input.toString());
-    conf.setInt(MIN_SUPPORT, minSupport);
+    conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
+    conf.setInt(DictionaryVectorizer.MIN_SUPPORT, minSupport);
     
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(LongWritable.class);
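
A minimal sketch (not part of this commit) of invoking createTermFrequencyVectors. The hunk above elides the middle of the parameter list, so the argument order below follows the Javadoc plus the visible tail (numReducers, chunkSizeInMegabytes, sequentialAccess) and should be read as an assumption; all paths and tuning values are placeholders:

import java.io.IOException;

import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;

public final class TfVectorsExample {
  private TfVectorsExample() {}

  public static void main(String[] args) throws IOException {
    // Input must already be tokenized StringTuples, e.g. via DocumentProcessor (next file).
    // Parameter order is assumed from the Javadoc; paths and values are placeholders.
    DictionaryVectorizer.createTermFrequencyVectors(
        "/tmp/tokenized-docs",  // input: tokenized docs in SequenceFile format
        "/tmp/tf-vectors",      // output directory for the document vectors
        2,                      // minSupport: minimum corpus frequency of a feature
        2,                      // maxNGramSize: unigrams and bigrams
        50.0f,                  // minLLRValue: LLR threshold for pruning ngrams
        1,                      // numReducers
        100,                    // chunkSizeInMegabytes for dictionary chunks
        false);                 // sequentialAccess output vectors
  }
}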

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java Sat Feb 13 17:55:56 2010
@@ -36,12 +36,11 @@
 import org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper;
 
 /**
- * This class converts a set of input documents in the sequence file format of
- * {@link StringTuple}s.The {@link SequenceFile} input should have a
- * {@link Text} key containing the unique document identifier and a {@link Text}
- * value containing the whole document. The document should be stored in UTF-8
- * encoding which is recognizable by hadoop. It uses the given {@link Analyzer}
- * to process the document into {@link org.apache.lucene.analysis.Token}s.
+ * This class converts a set of input documents in the sequence file format of {@link StringTuple}s.The
+ * {@link SequenceFile} input should have a {@link Text} key containing the unique document identifier and a
+ * {@link Text} value containing the whole document. The document should be stored in UTF-8 encoding which is
+ * recognizable by hadoop. It uses the given {@link Analyzer} to process the document into
+ * {@link org.apache.lucene.analysis.Token}s.
  * 
  */
 public final class DocumentProcessor {
@@ -59,32 +58,27 @@
   }
   
   /**
-   * Convert the input documents into token array using the {@link StringTuple}
-   * The input documents has to be in the {@link SequenceFile} format
+   * Convert the input documents into token array using the {@link StringTuple} The input documents has to be
+   * in the {@link SequenceFile} format
    * 
    * @param input
    *          input directory of the documents in {@link SequenceFile} format
    * @param output
-   *          output directory were the {@link StringTuple} token array of each
-   *          document has to be created
+   *          output directory were the {@link StringTuple} token array of each document has to be created
    * @param analyzerClass
    *          The Lucene {@link Analyzer} for tokenizing the UTF-8 text
    * @throws IOException
    */
-  public static void tokenizeDocuments(String input,
-                                       Class<? extends Analyzer> analyzerClass,
-                                       String output) throws IOException {
+  public static void tokenizeDocuments(String input, Class<? extends Analyzer> analyzerClass, String output) throws IOException {
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(DocumentProcessor.class);
-    conf.set("io.serializations",
-      "org.apache.hadoop.io.serializer.JavaSerialization,"
-          + "org.apache.hadoop.io.serializer.WritableSerialization");
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
     // this conf parameter needs to be set enable serialisation of conf values
     
-    conf.set(ANALYZER_CLASS, analyzerClass.getName());
-    conf.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: "
-                    + input);
+    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClass.getName());
+    conf.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
     
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(StringTuple.class);
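
The tokenization step that feeds the vectorizer can also be run on its own. A minimal sketch with hypothetical paths, using Lucene's StandardAnalyzer (the default the tokenizer mapper falls back to, per the next file):

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;

public final class TokenizeExample {
  private TokenizeExample() {}

  public static void main(String[] args) throws IOException {
    // Input is a SequenceFile of <Text docId, Text document>; the output holds
    // one StringTuple of tokens per document. Paths are placeholders.
    DocumentProcessor.tokenizeDocuments("/tmp/raw-docs",
                                        StandardAnalyzer.class,
                                        "/tmp/tokenized-docs");
  }
}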

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Sat Feb 13 17:55:56 2010
@@ -36,20 +36,14 @@
 /**
  * Tokenizes a text document and outputs tokens in a StringTuple
  */
-public class SequenceFileTokenizerMapper extends MapReduceBase implements
-    Mapper<Text,Text,Text,StringTuple> {
+public class SequenceFileTokenizerMapper extends MapReduceBase implements Mapper<Text,Text,Text,StringTuple> {
   
   private Analyzer analyzer;
   
   @Override
-  public void map(Text key,
-                  Text value,
-                  OutputCollector<Text,StringTuple> output,
-                  Reporter reporter) throws IOException {
-    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(
-        value.toString()));
-    TermAttribute termAtt = (TermAttribute) stream
-        .addAttribute(TermAttribute.class);
+  public void map(Text key, Text value, OutputCollector<Text,StringTuple> output, Reporter reporter) throws IOException {
+    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
     StringTuple document = new StringTuple();
     while (stream.incrementToken()) {
       if (termAtt.termLength() > 0) {
@@ -64,8 +58,8 @@
     super.configure(job);
     try {
       ClassLoader ccl = Thread.currentThread().getContextClassLoader();
-      Class<?> cl = ccl.loadClass(job.get(DocumentProcessor.ANALYZER_CLASS,
-        StandardAnalyzer.class.getName()));
+      Class<?> cl = ccl
+          .loadClass(job.get(DocumentProcessor.ANALYZER_CLASS, StandardAnalyzer.class.getName()));
       analyzer = (Analyzer) cl.newInstance();
     } catch (ClassNotFoundException e) {
       throw new IllegalStateException(e);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Sat Feb 13 17:55:56 2010
@@ -40,8 +40,8 @@
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.IteratorTokenStream;
+import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 
 /**
@@ -52,7 +52,7 @@
   private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
   
   private final VectorWritable vectorWritable = new VectorWritable();
-
+  
   private int dimension;
   private boolean sequentialAccess;
   
@@ -63,22 +63,24 @@
                      Iterator<StringTuple> values,
                      OutputCollector<Text,VectorWritable> output,
                      Reporter reporter) throws IOException {
-    if (values.hasNext() == false) return;
+    if (values.hasNext() == false) {
+      return;
+    }
     StringTuple value = values.next();
     
-    Vector vector = new RandomAccessSparseVector(key.toString(),
-                                                 dimension,
-                                                 value.length()); // guess at initial size
+    Vector vector = new RandomAccessSparseVector(key.toString(), dimension, value.length()); // guess at
+                                                                                             // initial size
     
     if (maxNGramSize >= 2) {
-      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
-          .getEntries().iterator()), maxNGramSize);
+      ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
+          maxNGramSize);
       
       do {
-        String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
-            .term();
+        String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
         if (term.length() > 0) { // ngram
-          if (dictionary.containsKey(term) == false) continue;
+          if (dictionary.containsKey(term) == false) {
+            continue;
+          }
           int termId = dictionary.get(term);
           vector.setQuick(termId, vector.getQuick(termId) + 1);
         }
@@ -89,7 +91,9 @@
     } else {
       for (String term : value.getEntries()) {
         if (term.length() > 0) { // unigram
-          if (dictionary.containsKey(term) == false) continue;
+          if (dictionary.containsKey(term) == false) {
+            continue;
+          }
           int termId = dictionary.get(term);
           vector.setQuick(termId, vector.getQuick(termId) + 1);
         }
@@ -112,13 +116,11 @@
       maxNGramSize = job.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
       URI[] localFiles = DistributedCache.getCacheFiles(job);
       if (localFiles == null || localFiles.length < 1) {
-        throw new IllegalArgumentException(
-            "missing paths from the DistributedCache");
+        throw new IllegalArgumentException("missing paths from the DistributedCache");
       }
       Path dictionaryFile = new Path(localFiles[0].getPath());
       FileSystem fs = dictionaryFile.getFileSystem(job);
-      SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile,
-          job);
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, job);
       Text key = new Text();
       IntWritable value = new IntWritable();
       

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java Sat Feb 13 17:55:56 2010
@@ -30,12 +30,10 @@
 import org.apache.mahout.math.map.OpenObjectLongHashMap;
 
 /**
- * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the
- * count of the words
+ * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the count of the words
  * 
  */
-public class TermCountMapper extends MapReduceBase implements
-    Mapper<Text,StringTuple,Text,LongWritable> {
+public class TermCountMapper extends MapReduceBase implements Mapper<Text,StringTuple,Text,LongWritable> {
   @Override
   public void map(Text key,
                   StringTuple value,
@@ -45,7 +43,9 @@
     for (String word : value.getEntries()) {
       if (wordCount.containsKey(word) == false) {
         wordCount.put(word, 1);
-      } else wordCount.put(word, wordCount.get(word) + 1);
+      } else {
+        wordCount.put(word, wordCount.get(word) + 1);
+      }
     }
     wordCount.forEachPair(new ObjectLongProcedure<String>() {
       @Override
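
TermCountMapper above accumulates per-document counts in an OpenObjectLongHashMap before emitting them. A minimal standalone sketch of the same counting idiom, over an illustrative token list:

import org.apache.mahout.math.map.OpenObjectLongHashMap;

public final class WordCountSketch {
  private WordCountSketch() {}

  public static void main(String[] args) {
    OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<String>();
    // Same idiom as TermCountMapper.map(): put 1 on first sight, otherwise increment.
    for (String word : new String[] {"the", "quick", "the", "fox"}) {
      if (!wordCount.containsKey(word)) {
        wordCount.put(word, 1);
      } else {
        wordCount.put(word, wordCount.get(word) + 1);
      }
    }
    System.out.println("the -> " + wordCount.get("the")); // prints "the -> 2"
  }
}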

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java Sat Feb 13 17:55:56 2010
@@ -30,11 +30,9 @@
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 
 /**
- * Can also be used as a local Combiner. This accumulates all the words and the
- * weights and sums them up.
+ * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
  */
-public class TermCountReducer extends MapReduceBase implements
-    Reducer<Text,LongWritable,Text,LongWritable> {
+public class TermCountReducer extends MapReduceBase implements Reducer<Text,LongWritable,Text,LongWritable> {
   
   private static int minSupport;
   
@@ -44,9 +42,10 @@
                      OutputCollector<Text,LongWritable> output,
                      Reporter reporter) throws IOException {
     long sum = 0;
-    while (values.hasNext())
+    while (values.hasNext()) {
       sum += values.next().get();
-    if (sum >= minSupport) {
+    }
+    if (sum >= TermCountReducer.minSupport) {
       output.collect(key, new LongWritable(sum));
     }
   }
@@ -54,7 +53,7 @@
   @Override
   public void configure(JobConf job) {
     super.configure(job);
-    minSupport = job.getInt(DictionaryVectorizer.MIN_SUPPORT,
+    TermCountReducer.minSupport = job.getInt(DictionaryVectorizer.MIN_SUPPORT,
       DictionaryVectorizer.DEFAULT_MIN_SUPPORT);
   }
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java Sat Feb 13 17:55:56 2010
@@ -28,8 +28,8 @@
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * TextVectorizer Document Frequency Count Mapper. Outputs 1 for each feature
@@ -51,8 +51,8 @@
     
     while (it.hasNext()) {
       Element e = it.next();
-      output.collect(new IntWritable(e.index()), ONE);
+      output.collect(new IntWritable(e.index()), TermDocumentCountMapper.ONE);
     }
-    output.collect(TOTAL_COUNT, ONE);
+    output.collect(TermDocumentCountMapper.TOTAL_COUNT, TermDocumentCountMapper.ONE);
   }
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java Sat Feb 13 17:55:56 2010
@@ -28,8 +28,7 @@
 import org.apache.hadoop.mapred.Reporter;
 
 /**
- * Can also be used as a local Combiner. This accumulates all the features and
- * the weights and sums them up.
+ * Can also be used as a local Combiner. This accumulates all the features and the weights and sums them up.
  */
 public class TermDocumentCountReducer extends MapReduceBase implements
     Reducer<IntWritable,LongWritable,IntWritable,LongWritable> {
@@ -40,8 +39,9 @@
                      OutputCollector<IntWritable,LongWritable> output,
                      Reporter reporter) throws IOException {
     long sum = 0;
-    while (values.hasNext())
+    while (values.hasNext()) {
       sum += values.next().get();
+    }
     output.collect(key, new LongWritable(sum));
   }
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Sat Feb 13 17:55:56 2010
@@ -48,11 +48,10 @@
 import org.apache.mahout.utils.vectors.text.term.TermDocumentCountReducer;
 
 /**
- * This class converts a set of input vectors with term frequencies to TfIdf
- * vectors. The Sequence file input should have a {@link WritableComparable} key
- * containing and a {@link VectorWritable} value containing the term frequency
- * vector. This is conversion class uses multiple map/reduces to convert the
- * vectors to TfIdf format
+ * This class converts a set of input vectors with term frequencies to TfIdf vectors. The Sequence file input
+ * should have a {@link WritableComparable} key containing and a {@link VectorWritable} value containing the
+ * term frequency vector. This is conversion class uses multiple map/reduces to convert the vectors to TfIdf
+ * format
  * 
  */
 public final class TFIDFConverter {
@@ -91,31 +90,26 @@
   }
   
   /**
-   * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the
-   * input set of vectors in {@link SequenceFile} format. This job uses a fixed
-   * limit on the maximum memory used by the feature chunk per node thereby
-   * splitting the process across multiple map/reduces.
+   * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors in
+   * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature chunk
+   * per node thereby splitting the process across multiple map/reduces.
    * 
    * @param input
    *          input directory of the vectors in {@link SequenceFile} format
    * @param output
-   *          output directory where
-   *          {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the
-   *          document are generated
+   *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
+   *          are generated
    * @param minDf
    *          The minimum document frequency. Default 1
    * @param maxDFPercent
-   *          The max percentage of vectors for the DF. Can be used to remove
-   *          really high frequency features. Expressed as an integer between 0
-   *          and 100. Default 99
+   *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
+   *          Expressed as an integer between 0 and 100. Default 99
    * @param chunkSizeInMegabytes
-   *          the size in MB of the feature => id chunk to be kept in memory at
-   *          each node during Map/Reduce stage. Its recommended you calculated
-   *          this based on the number of cores and the free memory available to
-   *          you per node. Say, you have 2 cores and around 1GB extra memory to
-   *          spare we recommend you use a split size of around 400-500MB so
-   *          that two simultaneous reducers can create partial vectors without
-   *          thrashing the system due to increased swapping
+   *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
+   *          stage. Its recommended you calculated this based on the number of cores and the free memory
+   *          available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
+   *          recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
+   *          partial vectors without thrashing the system due to increased swapping
    * @throws IOException
    */
   public static void processTfIdf(String input,
@@ -125,52 +119,48 @@
                                   int maxDFPercent,
                                   float normPower,
                                   boolean sequentialAccessOutput) throws IOException {
-    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
-      chunkSizeInMegabytes = MIN_CHUNKSIZE;
-    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
-      chunkSizeInMegabytes = MAX_CHUNKSIZE;
+    if (chunkSizeInMegabytes < TFIDFConverter.MIN_CHUNKSIZE) {
+      chunkSizeInMegabytes = TFIDFConverter.MIN_CHUNKSIZE;
+    } else if (chunkSizeInMegabytes > TFIDFConverter.MAX_CHUNKSIZE) { // 10GB
+      chunkSizeInMegabytes = TFIDFConverter.MAX_CHUNKSIZE;
     }
     
     if (normPower != PartialVectorMerger.NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }
     
-    if (minDf < 1) minDf = 1;
-    if (maxDFPercent < 0 || maxDFPercent > 100) maxDFPercent = 99;
+    if (minDf < 1) {
+      minDf = 1;
+    }
+    if (maxDFPercent < 0 || maxDFPercent > 100) {
+      maxDFPercent = 99;
+    }
     
     Path inputPath = new Path(input);
-    Path wordCountPath = new Path(output + WORDCOUNT_OUTPUT_FOLDER);
+    Path wordCountPath = new Path(output + TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
     
-    startDFCounting(inputPath, wordCountPath);
-    Pair<Long[],List<Path>> datasetFeatures = createDictionaryChunks(
-      wordCountPath, output, chunkSizeInMegabytes);
+    TFIDFConverter.startDFCounting(inputPath, wordCountPath);
+    Pair<Long[],List<Path>> datasetFeatures = TFIDFConverter.createDictionaryChunks(wordCountPath, output,
+      chunkSizeInMegabytes);
     
     int partialVectorIndex = 0;
     List<Path> partialVectorPaths = new ArrayList<Path>();
     List<Path> dictionaryChunks = datasetFeatures.getSecond();
     for (Path dictionaryChunk : dictionaryChunks) {
-      Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
+      Path partialVectorOutputPath = TFIDFConverter.getPath(output + TFIDFConverter.VECTOR_OUTPUT_FOLDER,
         partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
-      makePartialVectors(input,
-                         datasetFeatures.getFirst()[0],
-                         datasetFeatures.getFirst()[1],
-                         minDf,
-                         maxDFPercent,
-                         dictionaryChunk,
-                         partialVectorOutputPath);
+      TFIDFConverter.makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures.getFirst()[1],
+        minDf, maxDFPercent, dictionaryChunk, partialVectorOutputPath);
     }
     
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);
     
-    String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
+    String outputDir = output + TFIDFConverter.DOCUMENT_VECTOR_OUTPUT_FOLDER;
     if (dictionaryChunks.size() > 1) {
-      PartialVectorMerger.mergePartialVectors(partialVectorPaths,
-                                              outputDir,
-                                              normPower,
-                                              (int)(long)datasetFeatures.getFirst()[0],
-                                              sequentialAccessOutput);
+      PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
+        (int) (long) datasetFeatures.getFirst()[0], sequentialAccessOutput);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -180,9 +170,8 @@
   }
   
   /**
-   * Read the document frequency List which is built at the end of the DF Count
-   * Job. This will use constant memory and will run at the speed of your disk
-   * read
+   * Read the document frequency list which is built at the end of the DF Count job. This will use constant
+   * memory and will run at the speed of your disk read.
    * 
    * @param featureCountPath
    * @param dictionaryPathBase
@@ -198,16 +187,15 @@
     Configuration conf = new Configuration();
     
     FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
-    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath
-        .toString()
-                                                      + OUTPUT_FILES_PATTERN));
+    FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath.toString()
+                                                      + TFIDFConverter.OUTPUT_FILES_PATTERN));
     
     long chunkSizeLimit = chunkSizeInMegabytes * 1024 * 1024;
     int chunkIndex = 0;
-    Path chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
+    Path chunkPath = TFIDFConverter.getPath(dictionaryPathBase + TFIDFConverter.FREQUENCY_FILE, chunkIndex);
     chunkPaths.add(chunkPath);
-    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf,
-        chunkPath, IntWritable.class, LongWritable.class);
+    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
+        LongWritable.class);
     
     long currentChunkSize = 0;
     long featureCount = 0;
@@ -221,16 +209,14 @@
           freqWriter.close();
           chunkIndex++;
           
-          chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
+          chunkPath = TFIDFConverter.getPath(dictionaryPathBase + TFIDFConverter.FREQUENCY_FILE, chunkIndex);
           chunkPaths.add(chunkPath);
           
-          freqWriter = new SequenceFile.Writer(fs, conf, chunkPath,
-              IntWritable.class, LongWritable.class);
+          freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
           currentChunkSize = 0;
         }
         
-        int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + (Integer.SIZE / 8)
-                        + (Long.SIZE / 8);
+        int fieldSize = TFIDFConverter.SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
         currentChunkSize += fieldSize;
         if (key.get() >= 0) {
           freqWriter.append(key, value);
@@ -251,8 +237,8 @@
   }
   
   /**
-   * Create a partial tfidf vector using a chunk of features from the input
-   * vectors. The input vectors has to be in the {@link SequenceFile} format
+   * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
+   * be in the {@link SequenceFile} format.
    * 
    * @param input
    *          input directory of the vectors in {@link SequenceFile} format
@@ -263,9 +249,8 @@
    * @param minDf
    *          The minimum document frequency. Default 1
    * @param maxDFPercent
-   *          The max percentage of vectors for the DF. Can be used to remove
-   *          really high frequency features. Expressed as an integer between 0
-   *          and 100. Default 99
+   *          The maximum document frequency, expressed as a percentage of the number of vectors (an integer
+   *          between 0 and 100); can be used to remove very high frequency features. Default 99
    * @param dictionaryFilePath
    *          location of the chunk of features and the id's
    * @param output
@@ -282,22 +267,19 @@
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(TFIDFConverter.class);
-    conf.set("io.serializations",
-      "org.apache.hadoop.io.serializer.JavaSerialization,"
-          + "org.apache.hadoop.io.serializer.WritableSerialization");
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
     // this conf parameter needs to be set to enable serialisation of conf values
     
-    conf.setJobName("TFIDFConverter:: MakePartialVectors: input-folder: "
-                    + input + ", dictionary-file: "
+    conf.setJobName("TFIDFConverter:: MakePartialVectors: input-folder: " + input + ", dictionary-file: "
                     + dictionaryFilePath.toString());
-    conf.setLong(FEATURE_COUNT, featureCount.longValue());
-    conf.setLong(VECTOR_COUNT, vectorCount.longValue());
-    conf.setInt(MIN_DF, minDf);
-    conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
+    conf.setLong(TFIDFConverter.FEATURE_COUNT, featureCount.longValue());
+    conf.setLong(TFIDFConverter.VECTOR_COUNT, vectorCount.longValue());
+    conf.setInt(TFIDFConverter.MIN_DF, minDf);
+    conf.setInt(TFIDFConverter.MAX_DF_PERCENTAGE, maxDFPercent);
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(VectorWritable.class);
-    DistributedCache
-        .setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
+    DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
     FileInputFormat.setInputPaths(conf, new Path(input));
     
     FileOutputFormat.setOutputPath(conf, output);
@@ -316,20 +298,18 @@
   }
   
   /**
-   * Count the document frequencies of features in parallel using Map/Reduce.
-   * The input documents have to be in {@link SequenceFile} format
+   * Count the document frequencies of features in parallel using Map/Reduce. The input documents have to be
+   * in {@link SequenceFile} format
    */
   private static void startDFCounting(Path input, Path output) throws IOException {
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(TFIDFConverter.class);
-    conf.set("io.serializations",
-      "org.apache.hadoop.io.serializer.JavaSerialization,"
-          + "org.apache.hadoop.io.serializer.WritableSerialization");
+    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+                                  + "org.apache.hadoop.io.serializer.WritableSerialization");
     // this conf parameter needs to be set to enable serialisation of conf values
     
-    conf.setJobName("VectorTfIdf Document Frequency Count running over input: "
-                    + input.toString());
+    conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());
     conf.setOutputKeyClass(IntWritable.class);
     conf.setOutputValueClass(LongWritable.class);
     

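A note on the chunk sizing above: processTfIdf clamps chunkSizeInMegabytes into [MIN_CHUNKSIZE, MAX_CHUNKSIZE], and createDictionaryChunks then rolls over to a new frequency chunk whenever a running per-record byte estimate (SEQUENCEFILE_BYTE_OVERHEAD plus one int and one long) passes that limit. The standalone sketch below restates only that arithmetic; the class name, the MIN/MAX values and the 45-byte per-record overhead are assumptions for illustration, not values read from TFIDFConverter.

// Minimal sketch of the chunk-sizing arithmetic; constants here are assumed, not Mahout's.
public class ChunkSizeSketch {

  // Assumed bounds standing in for TFIDFConverter.MIN_CHUNKSIZE / MAX_CHUNKSIZE.
  private static final int MIN_CHUNKSIZE_MB = 100;
  private static final int MAX_CHUNKSIZE_MB = 10000; // 10GB

  // Assumed per-record overhead standing in for SEQUENCEFILE_BYTE_OVERHEAD.
  private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;

  /** Clamp the requested chunk size, as processTfIdf does before chunking. */
  static int clampChunkSize(int chunkSizeInMegabytes) {
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE_MB) {
      return MIN_CHUNKSIZE_MB;
    }
    return Math.min(chunkSizeInMegabytes, MAX_CHUNKSIZE_MB);
  }

  /** Estimate how many frequency chunks featureCount (int, long) records will need. */
  static int estimateChunkCount(long featureCount, int chunkSizeInMegabytes) {
    long chunkSizeLimit = clampChunkSize(chunkSizeInMegabytes) * 1024L * 1024L;
    // one IntWritable/LongWritable record, mirroring the fieldSize calculation in the hunk above
    long fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
    long recordsPerChunk = Math.max(1L, chunkSizeLimit / fieldSize);
    return (int) ((featureCount + recordsPerChunk - 1) / recordsPerChunk);
  }

  public static void main(String[] args) {
    // 2 cores and ~1GB to spare: a 400-500MB chunk lets two simultaneous reducers
    // each hold one chunk in memory without swapping, per the Javadoc recommendation.
    System.out.println(clampChunkSize(450));                // 450
    System.out.println(estimateChunkCount(20000000L, 450)); // 3
  }
}
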
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java Sat Feb 13 17:55:56 2010
@@ -35,16 +35,15 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.map.OpenIntLongHashMap;
 import org.apache.mahout.utils.vectors.TFIDF;
 
 /**
  * Converts a document into a sparse vector
  */
-public class TFIDFPartialVectorReducer extends MapReduceBase
-    implements
+public class TFIDFPartialVectorReducer extends MapReduceBase implements
     Reducer<WritableComparable<?>,VectorWritable,WritableComparable<?>,VectorWritable> {
   
   private final OpenIntLongHashMap dictionary = new OpenIntLongHashMap();
@@ -60,19 +59,27 @@
                      Iterator<VectorWritable> values,
                      OutputCollector<WritableComparable<?>,VectorWritable> output,
                      Reporter reporter) throws IOException {
-    if (!values.hasNext()) return;
+    if (!values.hasNext()) {
+      return;
+    }
     Vector value = values.next().get();
     Iterator<Element> it = value.iterateNonZero();
-    Vector vector = new RandomAccessSparseVector(key
-        .toString(), (int)featureCount, value.getNumNondefaultElements());
+    Vector vector = new RandomAccessSparseVector(key.toString(), (int) featureCount, value
+        .getNumNondefaultElements());
     while (it.hasNext()) {
       Element e = it.next();
-      if (!dictionary.containsKey(e.index())) continue;
+      if (!dictionary.containsKey(e.index())) {
+        continue;
+      }
       long df = dictionary.get(e.index());
-      if (df / vectorCount > maxDfPercent) continue;
-      if (df < minDf) df = minDf;
-      vector.setQuick(e.index(), tfidf.calculate((int) e.get(), (int) df,
-        (int) featureCount, (int) vectorCount));
+      if (df / vectorCount > maxDfPercent) {
+        continue;
+      }
+      if (df < minDf) {
+        df = minDf;
+      }
+      vector.setQuick(e.index(), tfidf.calculate((int) e.get(), (int) df, (int) featureCount,
+        (int) vectorCount));
     }
     
     vectorWritable.set(vector);
@@ -86,8 +93,7 @@
       
       URI[] localFiles = DistributedCache.getCacheFiles(job);
       if (localFiles == null || localFiles.length < 1) {
-        throw new IllegalArgumentException(
-            "missing paths from the DistributedCache");
+        throw new IllegalArgumentException("missing paths from the DistributedCache");
       }
       
       vectorCount = job.getLong(TFIDFConverter.VECTOR_COUNT, 1);
@@ -97,8 +103,7 @@
       
       Path dictionaryFile = new Path(localFiles[0].getPath());
       FileSystem fs = dictionaryFile.getFileSystem(job);
-      SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile,
-          job);
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, job);
       IntWritable key = new IntWritable();
       LongWritable value = new LongWritable();
       

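For reference, the reduce method above applies three guards per feature before weighting: skip features missing from the dictionary chunk loaded from the DistributedCache, skip features whose document frequency exceeds the maxDfPercent cap, and floor df at minDf; the weight itself comes from Mahout's TFIDF class. The sketch below is a minimal restatement of that filtering assuming a plain tf * log(N / df) weight and a percentage-based DF cap; it is not the exact weighting Mahout uses.

// Minimal sketch of the per-feature filtering, with an assumed tf * log(N/df) weight.
import java.util.HashMap;
import java.util.Map;

public class TfIdfWeightSketch {

  /**
   * @param termFreqs    feature index -> term frequency in this document
   * @param dictionaryDf feature index -> document frequency (one dictionary chunk)
   * @param numDocs      total number of documents (the vectorCount)
   */
  static Map<Integer,Double> weigh(Map<Integer,Integer> termFreqs,
                                   Map<Integer,Long> dictionaryDf,
                                   long numDocs, int minDf, int maxDfPercent) {
    Map<Integer,Double> weights = new HashMap<Integer,Double>();
    for (Map.Entry<Integer,Integer> e : termFreqs.entrySet()) {
      Long df = dictionaryDf.get(e.getKey());
      if (df == null) {
        continue;                              // not in this dictionary chunk
      }
      if (df * 100L / numDocs > maxDfPercent) {
        continue;                              // too frequent: drop the feature
      }
      long effectiveDf = Math.max(df, minDf);  // floor at minDf
      double weight = e.getValue() * Math.log((double) numDocs / effectiveDf);
      weights.put(e.getKey(), weight);
    }
    return weights;
  }

  public static void main(String[] args) {
    Map<Integer,Integer> tf = new HashMap<Integer,Integer>();
    tf.put(7, 3);                              // feature 7 occurs 3 times in this doc
    Map<Integer,Long> df = new HashMap<Integer,Long>();
    df.put(7, 10L);                            // ... and in 10 of 1000 documents
    System.out.println(weigh(tf, df, 1000L, 1, 99)); // {7=13.81...}
  }
}
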
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java Sat Feb 13 17:55:56 2010
@@ -6,13 +6,13 @@
 import java.util.Iterator;
 import java.util.List;
 
+import junit.framework.Assert;
+
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;
 import org.apache.mahout.clustering.dirichlet.models.L1ModelDistribution;
@@ -33,66 +33,67 @@
 
 
 public class TestL1ModelClustering extends MahoutTestCase {
-
-  @SuppressWarnings("unchecked")
+  
   private class MapElement implements Comparable<MapElement> {
-
+    
     MapElement(double pdf, String doc) {
       super();
       this.pdf = pdf;
       this.doc = doc;
     }
-
+    
     private final Double pdf;
-
+    
     private final String doc;
-
+    
     @Override
     // reverse compare to sort in reverse order
     public int compareTo(MapElement e) {
-      if (e.pdf > pdf)
+      if (e.pdf > pdf) {
         return 1;
-      else if (e.pdf < pdf)
+      } else if (e.pdf < pdf) {
         return -1;
-      else
+      } else {
         return 0;
+      }
     }
-
+    
+    @Override
     public String toString() {
       return pdf.toString() + ' ' + doc.toString();
     }
-
+    
   }
-
+  
   private static final String[] DOCS = { "The quick red fox jumped over the lazy brown dogs.",
-      "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
-      "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
-      "Moby Dick is a story of a whale and a man obsessed.", "The robber wore a black fleece jacket and a baseball cap.",
-      "The English Springer Spaniel is the best of all dogs." };
-
+                                         "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
+                                         "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
+                                         "Moby Dick is a story of a whale and a man obsessed.", "The robber wore a black fleece jacket and a baseball cap.",
+  "The English Springer Spaniel is the best of all dogs." };
+  
   private List<VectorWritable> sampleData;
-
+  
   private static final String[] DOCS2 = { "The quick red fox jumped over the lazy brown dogs.",
-      "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
-      "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
-      "Mary had a little goat whose fleece was white as snow.", "Mary had a little lamb whose fleece was black as tar.",
-      "Dick had a little goat whose fleece was white as snow.", "Moby Dick is a story of a whale and a man obsessed.",
-      "Moby Bob is a story of a walrus and a man obsessed.", "Moby Dick is a story of a whale and a crazy man.",
-      "The robber wore a black fleece jacket and a baseball cap.", "The robber wore a red fleece jacket and a baseball cap.",
-      "The robber wore a white fleece jacket and a baseball cap.", "The English Springer Spaniel is the best of all dogs." };
-
+                                          "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
+                                          "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
+                                          "Mary had a little goat whose fleece was white as snow.", "Mary had a little lamb whose fleece was black as tar.",
+                                          "Dick had a little goat whose fleece was white as snow.", "Moby Dick is a story of a whale and a man obsessed.",
+                                          "Moby Bob is a story of a walrus and a man obsessed.", "Moby Dick is a story of a whale and a crazy man.",
+                                          "The robber wore a black fleece jacket and a baseball cap.", "The robber wore a red fleece jacket and a baseball cap.",
+                                          "The robber wore a white fleece jacket and a baseball cap.", "The English Springer Spaniel is the best of all dogs." };
+  
   @Override
   @Before
   public void setUp() throws Exception {
     super.setUp();
     RandomUtils.useTestSeed();
   }
-
+  
   private void getSampleData(String[] docs2) throws IOException {
     sampleData = new ArrayList<VectorWritable>();
     RAMDirectory directory = new RAMDirectory();
     IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true,
-        IndexWriter.MaxFieldLength.UNLIMITED);
+      IndexWriter.MaxFieldLength.UNLIMITED);
     for (int i = 0; i < docs2.length; i++) {
       Document doc = new Document();
       Field id = new Field("id", "doc_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
@@ -108,15 +109,15 @@
     TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
     VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
     LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper);
-
+    
     int i = 0;
     for (Vector vector : iterable) {
-      assertNotNull(vector);
-      System.out.println("Vector[" + i++ + "]=" + formatVector(vector));
+      Assert.assertNotNull(vector);
+      System.out.println("Vector[" + i++ + "]=" + TestL1ModelClustering.formatVector(vector));
       sampleData.add(new VectorWritable(vector));
     }
   }
-
+  
   private static String formatVector(Vector v) {
     StringBuilder buf = new StringBuilder();
     int nzero = 0;
@@ -131,17 +132,19 @@
     int nextIx = 0;
     for (int i = 0; i < v.size(); i++) {
       double elem = v.get(i);
-      if (elem == 0.0)
+      if (elem == 0.0) {
         continue;
-      if (i > nextIx)
+      }
+      if (i > nextIx) {
         buf.append("..{").append(i).append("}=");
+      }
       buf.append(String.format("%.2f", elem)).append(", ");
       nextIx = i + 1;
     }
     buf.append(']');
     return buf.toString();
   }
-
+  
   private static void printSamples(List<Model<VectorWritable>[]> result, int significant) {
     int row = 0;
     for (Model<VectorWritable>[] r : result) {
@@ -161,13 +164,14 @@
     }
     System.out.println();
   }
-
+  
   private void printClusters(Model<VectorWritable>[] models, List<VectorWritable> samples, String[] docs) {
     for (int m = 0; m < models.length; m++) {
       Model<VectorWritable> model = models[m];
       int count = model.count();
-      if (count == 0)
+      if (count == 0) {
         continue;
+      }
       System.out.println("Model[" + m + "] had " + count + " hits (!) and " + (samples.size()-count) + " misses (? in pdf order) during the last iteration:");
       MapElement[] map = new MapElement[samples.size()];
       // sort the samples by pdf
@@ -178,35 +182,36 @@
       Arrays.sort(map);
       // now find the n=model.count() most likely docs and output them
       for (int i = 0; i < map.length; i++) {
-        if (i < count)
+        if (i < count) {
           System.out.print("! ");
-        else
+        } else {
           System.out.print("? ");
+        }
         System.out.println(map[i].doc);
       }
     }
   }
-
+  
   public void testDocs() throws Exception {
     System.out.println("testDocs");
-    getSampleData(DOCS);
+    getSampleData(TestL1ModelClustering.DOCS);
     DirichletClusterer<VectorWritable> dc = new DirichletClusterer<VectorWritable>(sampleData, new L1ModelDistribution(sampleData
-        .get(0)), 1.0, 15, 1, 0);
+      .get(0)), 1.0, 15, 1, 0);
     List<Model<VectorWritable>[]> result = dc.cluster(10);
-    assertNotNull(result);
-    printSamples(result, 0);
-    printClusters(result.get(result.size() - 1), sampleData, DOCS);
+    Assert.assertNotNull(result);
+    TestL1ModelClustering.printSamples(result, 0);
+    printClusters(result.get(result.size() - 1), sampleData, TestL1ModelClustering.DOCS);
   }
-
+  
   public void testDocs2() throws Exception {
     System.out.println("testDocs2");
-    getSampleData(DOCS2);
+    getSampleData(TestL1ModelClustering.DOCS2);
     DirichletClusterer<VectorWritable> dc = new DirichletClusterer<VectorWritable>(sampleData, new L1ModelDistribution(sampleData
-        .get(0)), 1.0, 15, 1, 0);
+      .get(0)), 1.0, 15, 1, 0);
     List<Model<VectorWritable>[]> result = dc.cluster(10);
-    assertNotNull(result);
-    printSamples(result, 0);
-    printClusters(result.get(result.size() - 1), sampleData, DOCS2);
+    Assert.assertNotNull(result);
+    TestL1ModelClustering.printSamples(result, 0);
+    printClusters(result.get(result.size() - 1), sampleData, TestL1ModelClustering.DOCS2);
   }
-
+  
 }

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java Sat Feb 13 17:55:56 2010
@@ -61,29 +61,31 @@
     key.set("dummy-key");
     
     String[] input = {"the", "best", "of", "times", "the", "worst", "of",
-                      "times"};
+    "times"};
     StringTuple inputTuple = new StringTuple();
     for (String i : input) {
       inputTuple.add(i);
     }
     
     String[][] values = new String[][] { {"h_the", "the best"},
-                                        {"t_best", "the best"},
-                                        {"h_of", "of times"},
-                                        {"t_times", "of times"},
-                                        {"h_best", "best of"},
-                                        {"t_of", "best of"},
-                                        {"h_the", "the worst"},
-                                        {"t_worst", "the worst"},
-                                        {"h_times", "times the"},
-                                        {"t_the", "times the"},
-                                        {"h_worst", "worst of"},
-                                        {"t_of", "worst of"},};
+                                         {"t_best", "the best"},
+                                         {"h_of", "of times"},
+                                         {"t_times", "of times"},
+                                         {"h_best", "best of"},
+                                         {"t_of", "best of"},
+                                         {"h_the", "the worst"},
+                                         {"t_worst", "the worst"},
+                                         {"h_times", "times the"},
+                                         {"t_the", "times the"},
+                                         {"h_worst", "worst of"},
+                                         {"t_of", "worst of"},};
     // set up expectations for mocks. ngram max size = 2
     for (String[] v : values) {
       Type p = v[0].startsWith("h") ? HEAD : TAIL;
       int frequency = 1;
-      if (v[1].equals("of times")) frequency = 2;
+      if (v[1].equals("of times")) {
+        frequency = 2;
+      }
       Gram subgram = new Gram(v[0].substring(2), frequency, p);
       Gram ngram = new Gram(v[1], frequency);
       collector.collect(subgram, ngram);
@@ -110,34 +112,36 @@
     key.set("dummy-key");
     
     String[] input = {"the", "best", "of", "times", "the", "worst", "of",
-                      "times"};
+    "times"};
     StringTuple inputTuple = new StringTuple();
     for (String i : input) {
       inputTuple.add(i);
     }
     
     String[][] values = new String[][] { {"h_the", "the best"},
-                                        {"t_best", "the best"},
-                                        {"h_of", "of times"},
-                                        {"t_times", "of times"},
-                                        {"h_best", "best of"},
-                                        {"t_of", "best of"},
-                                        {"h_the", "the worst"},
-                                        {"t_worst", "the worst"},
-                                        {"h_times", "times the"},
-                                        {"t_the", "times the"},
-                                        {"h_worst", "worst of"},
-                                        {"t_of", "worst of"},
-                                        {"u_worst", "worst"}, {"u_of", "of"},
-                                        {"u_the", "the"}, {"u_best", "best"},
-                                        {"u_times", "times"},};
+                                         {"t_best", "the best"},
+                                         {"h_of", "of times"},
+                                         {"t_times", "of times"},
+                                         {"h_best", "best of"},
+                                         {"t_of", "best of"},
+                                         {"h_the", "the worst"},
+                                         {"t_worst", "the worst"},
+                                         {"h_times", "times the"},
+                                         {"t_the", "times the"},
+                                         {"h_worst", "worst of"},
+                                         {"t_of", "worst of"},
+                                         {"u_worst", "worst"}, {"u_of", "of"},
+                                         {"u_the", "the"}, {"u_best", "best"},
+                                         {"u_times", "times"},};
     // set up expectations for mocks. ngram max size = 2
     for (String[] v : values) {
       Type p = v[0].startsWith("h") ? HEAD : TAIL;
       p = v[0].startsWith("u") ? UNIGRAM : p;
       int frequency = 1;
       if (v[1].equals("of times") || v[1].equals("of") || v[1].equals("times")
-          || v[1].equals("the")) frequency = 2;
+          || v[1].equals("the")) {
+        frequency = 2;
+      }
       Gram subgram = new Gram(v[0].substring(2), frequency, p);
       Gram ngram = new Gram(v[1], frequency);
       collector.collect(subgram, ngram);

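The expectations above encode the mapper's convention for a window of size 2: each bigram contributes one HEAD subgram (h_ prefix, first token) and one TAIL subgram (t_ prefix, second token), each paired with the full bigram, plus u_ unigrams when unigram emission is enabled. The sketch below reproduces just that pairing; it is a simplification in that the real mapper emits Gram objects and aggregates repeated ngrams into per-document frequencies, which this sketch does not.

// Minimal sketch of the head/tail/unigram pairing exercised by the test above.
import java.util.ArrayList;
import java.util.List;

public class BigramPairsSketch {

  static List<String[]> pairs(String[] tokens, boolean emitUnigrams) {
    List<String[]> out = new ArrayList<String[]>();
    for (int i = 0; i + 1 < tokens.length; i++) {
      String ngram = tokens[i] + ' ' + tokens[i + 1];
      out.add(new String[] {"h_" + tokens[i], ngram});      // head of the bigram
      out.add(new String[] {"t_" + tokens[i + 1], ngram});  // tail of the bigram
    }
    if (emitUnigrams) {
      for (String token : tokens) {
        out.add(new String[] {"u_" + token, token});        // unigram, paired with itself
      }
    }
    return out;
  }

  public static void main(String[] args) {
    String[] input = {"the", "best", "of", "times", "the", "worst", "of", "times"};
    for (String[] p : pairs(input, false)) {
      System.out.println(p[0] + " -> " + p[1]);  // h_the -> the best, t_best -> the best, ...
    }
  }
}
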
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java Sat Feb 13 17:55:56 2010
@@ -44,28 +44,28 @@
     output = EasyMock.createMock(OutputCollector.class);
     reporter = EasyMock.createMock(Reporter.class);
   }
-
+  
   @Test
   public void testReduce() throws Exception {
     // test input, input[*][0] is the key,
     // input[*][1..n] are the values passed in via
     // the iterator.
     Gram[][] input = new Gram[][] {
-        { new Gram("the",   UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM) },
-        { new Gram("the",   HEAD), new Gram("the best"), new Gram("the worst") },
-        { new Gram("of",    HEAD), new Gram("of times"), new Gram("of times") },
-        { new Gram("times", TAIL), new Gram("of times"), new Gram("of times") }
+                                   { new Gram("the",   UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM) },
+                                   { new Gram("the",   HEAD), new Gram("the best"), new Gram("the worst") },
+                                   { new Gram("of",    HEAD), new Gram("of times"), new Gram("of times") },
+                                   { new Gram("times", TAIL), new Gram("of times"), new Gram("of times") }
     };
-
+    
     // expected results.
     Gram[][] values = new Gram[][] {
-        { new Gram("the", 4, UNIGRAM), new Gram("the", 2, UNIGRAM) },                             
-        { new Gram("the best",  1), new Gram("the", 2,   HEAD) }, 
-        { new Gram("the worst", 1), new Gram("the", 2,   HEAD) }, 
-        { new Gram("of times",  2), new Gram("of",  2,   HEAD) }, 
-        { new Gram("of times",  2), new Gram("times", 2, TAIL) }
+                                    { new Gram("the", 4, UNIGRAM), new Gram("the", 2, UNIGRAM) },
+                                    { new Gram("the best",  1), new Gram("the", 2,   HEAD) },
+                                    { new Gram("the worst", 1), new Gram("the", 2,   HEAD) },
+                                    { new Gram("of times",  2), new Gram("of",  2,   HEAD) },
+                                    { new Gram("of times",  2), new Gram("times", 2, TAIL) }
     };
-
+    
     // set up expectations
     for (Gram[] v : values) {
       output.collect(v[0], v[1]);