Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 18:56:01 UTC

svn commit: r909861 [1/4] - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/clustering/lda/ main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/ main/java/org/apache/mahout/utils/clustering/ main/java/org/apache/mahout/ut...

Author: robinanil
Date: Sat Feb 13 17:55:56 2010
New Revision: 909861

URL: http://svn.apache.org/viewvc?rev=909861&view=rev
Log:
MAHOUT-291
Code Cleanup in Utils

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TFIDF.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermEntry.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TermInfo.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Weight.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFType.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/Driver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/CachedTermInfo.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/GramTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/RandomVectorIterable.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/SequenceFileVectorIterableTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/io/VectorWriterTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/lda/LDAPrintTopics.java Sat Feb 13 17:55:56 2010
@@ -27,7 +27,6 @@
 import java.util.List;
 import java.util.PriorityQueue;
 import java.util.Queue;
-import java.util.regex.Pattern;
 
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
@@ -50,9 +49,8 @@
  * Class to print out the top K words for each topic.
  */
 public class LDAPrintTopics {
-  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
   
-  private LDAPrintTopics() {}
+  private LDAPrintTopics() { }
   
   private static class StringDoublePair implements Comparable<StringDoublePair> {
     private final double score;
@@ -85,9 +83,9 @@
   }
   
   public static List<List<String>> topWordsForTopics(String dir,
-                                                     Configuration job,
-                                                     List<String> wordList,
-                                                     int numWordsToPrint) throws IOException {
+    Configuration job,
+    List<String> wordList,
+    int numWordsToPrint) throws IOException {
     FileSystem fs = new Path(dir).getFileSystem(job);
     
     List<PriorityQueue<StringDoublePair>> queues = new ArrayList<PriorityQueue<StringDoublePair>>();
@@ -101,11 +99,11 @@
         int topic = key.getX();
         int word = key.getY();
         
-        ensureQueueSize(queues, topic);
+        LDAPrintTopics.ensureQueueSize(queues, topic);
         if (word >= 0 && topic >= 0) {
           double score = value.get();
           String realWord = wordList.get(word);
-          maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
+          LDAPrintTopics.maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
         }
       }
       reader.close();
@@ -149,42 +147,42 @@
     GroupBuilder gbuilder = new GroupBuilder();
     
     Option inputOpt = obuilder.withLongName("input").withRequired(true)
-        .withArgument(
-          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
-        .withDescription("Path to an LDA output (a state)").withShortName("i")
-        .create();
+    .withArgument(
+      abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+      .withDescription("Path to an LDA output (a state)").withShortName("i")
+      .create();
     
     Option dictOpt = obuilder.withLongName("dict").withRequired(true)
-        .withArgument(
-          abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "Dictionary to read in, in the same format as one created by "
-              + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
-          "d").create();
+    .withArgument(
+      abuilder.withName("dict").withMinimum(1).withMaximum(1).create())
+      .withDescription(
+        "Dictionary to read in, in the same format as one created by "
+        + "org.apache.mahout.utils.vectors.lucene.Driver").withShortName(
+        "d").create();
     
     Option outOpt = obuilder.withLongName("output").withRequired(true)
-        .withArgument(
-          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
-        .withDescription("Output directory to write top words").withShortName(
-          "o").create();
+    .withArgument(
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+      .withDescription("Output directory to write top words").withShortName(
+      "o").create();
     
     Option wordOpt = obuilder.withLongName("words").withRequired(false)
-        .withArgument(
-          abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
-            "20").create()).withDescription("Number of words to print")
-        .withShortName("w").create();
+    .withArgument(
+      abuilder.withName("words").withMinimum(0).withMaximum(1).withDefault(
+      "20").create()).withDescription("Number of words to print")
+      .withShortName("w").create();
     Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
       false).withArgument(
-      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
-          .create()).withDescription(
-      "The dictionary file type (text|sequencefile)").withShortName("dt")
+        abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
+        .create()).withDescription(
+        "The dictionary file type (text|sequencefile)").withShortName("dt")
         .create();
     Option helpOpt = obuilder.withLongName("help").withDescription(
-      "Print out help").withShortName("h").create();
+    "Print out help").withShortName("h").create();
     
     Group group = gbuilder.withName("Options").withOption(dictOpt).withOption(
       outOpt).withOption(wordOpt).withOption(inputOpt).withOption(dictTypeOpt)
-        .create();
+      .create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -212,7 +210,7 @@
       List<String> wordList;
       if (dictionaryType.equals("text")) {
         wordList = Arrays.asList(VectorHelper.loadTermDictionary(new File(
-            dictFile)));
+          dictFile)));
       } else if (dictionaryType.equals("sequencefile")) {
         FileSystem fs = FileSystem.get(new Path(dictFile).toUri(), config);
         wordList = Arrays.asList(VectorHelper.loadTermDictionary(config, fs,
@@ -221,7 +219,7 @@
         throw new IllegalArgumentException("Invalid dictionary format");
       }
       
-      List<List<String>> topWords = topWordsForTopics(input, config, wordList,
+      List<List<String>> topWords = LDAPrintTopics.topWordsForTopics(input, config, wordList,
         numWords);
       
       if (!output.exists()) {
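
For context on the ensureQueueSize/maybeEnqueue calls in the LDAPrintTopics hunks above: keeping the top K scored words per topic with a bounded java.util.PriorityQueue usually follows the pattern sketched below. This is a generic, self-contained illustration; the WordScore record and this maybeEnqueue are stand-ins (the diff's StringDoublePair and the real maybeEnqueue live mostly outside the hunks shown), so none of these names are part of the commit.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;
    import java.util.PriorityQueue;

    public final class TopKWordsSketch {

      /** Word plus its per-topic score (stand-in for the StringDoublePair in the diff). */
      record WordScore(String word, double score) {}

      /** Keep at most k entries; the queue head is always the lowest score retained so far. */
      static void maybeEnqueue(PriorityQueue<WordScore> queue, String word, double score, int k) {
        if (queue.size() < k) {
          queue.add(new WordScore(word, score));
        } else if (queue.peek().score() < score) {
          queue.poll();                              // evict the current minimum
          queue.add(new WordScore(word, score));
        }
      }

      public static void main(String[] args) {
        PriorityQueue<WordScore> queue =
            new PriorityQueue<>(Comparator.comparingDouble(WordScore::score));
        double[] scores = {0.2, 0.9, 0.4, 0.7, 0.1};
        for (int i = 0; i < scores.length; i++) {
          maybeEnqueue(queue, "word" + i, scores[i], 3);
        }
        List<WordScore> top = new ArrayList<>(queue);
        top.sort(Comparator.comparingDouble(WordScore::score).reversed());
        System.out.println(top);   // top 3 by score: word1 (0.9), word3 (0.7), word2 (0.4)
      }
    }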

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Sat Feb 13 17:55:56 2010
@@ -43,8 +43,7 @@
  */
 public final class SparseVectorsFromSequenceFiles {
   
-  private static final Logger log = LoggerFactory
-      .getLogger(SparseVectorsFromSequenceFiles.class);
+  private static final Logger log = LoggerFactory.getLogger(SparseVectorsFromSequenceFiles.class);
   
   private SparseVectorsFromSequenceFiles() {}
   
@@ -53,108 +52,74 @@
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
     
-    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
-        .withArgument(
-          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "input dir containing the documents in sequence file format")
-        .withShortName("i").create();
-    
-    Option outputDirOpt = obuilder
-        .withLongName("output")
-        .withRequired(true)
-        .withArgument(
-          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
-        .withDescription("The output directory").withShortName("o").create();
+    Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+      "input dir containing the documents in sequence file format").withShortName("i").create();
+    
+    Option outputDirOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The output directory").withShortName("o").create();
     Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
-      abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
-        .withDescription("(Optional) Minimum Support. Default Value: 2")
-        .withShortName("s").create();
-    
-    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
-        .withArgument(
-          abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
-              .create()).withDescription("The class name of the analyzer")
-        .withShortName("a").create();
+      abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional) Minimum Support. Default Value: 2").withShortName("s").create();
+    
+    Option analyzerNameOpt = obuilder.withLongName("analyzerName").withArgument(
+      abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The class name of the analyzer").withShortName("a").create();
     
     Option chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument(
-      abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
-        .withDescription("The chunkSize in MegaBytes. 100-10000 MB")
-        .withShortName("chunk").create();
-    
-    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
-        .withArgument(
-          abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
-        .withDescription("The kind of weight to use. Currently TF or TFIDF")
-        .withShortName("wt").create();
-    
-    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
-        .withArgument(
-          abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
-        .withDescription("The minimum document frequency.  Default is 1")
-        .withShortName("md").create();
+      abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();
+    
+    Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
+      abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();
+    
+    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
+      abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The minimum document frequency.  Default is 1").withShortName("md").create();
     
     Option maxDFPercentOpt = obuilder
         .withLongName("maxDFPercent")
         .withRequired(false)
-        .withArgument(
-          abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1)
-              .create())
+        .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "The max percentage of docs for the DF.  Can be used to remove really high frequency terms.  Expressed as an integer between 0 and 100. Default is 99.")
         .withShortName("x").create();
     
-    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
-        .withArgument(
-          abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
-              + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
-    
-    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
-        .withArgument(
-          abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
-              .create()).withDescription(
-          "(Optional) Number of reduce tasks. Default Value: 1").withShortName(
-          "nr").create();
-    
-    Option powerOpt = obuilder
-        .withLongName("norm")
-        .withRequired(false)
-        .withArgument(
-          abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
-              + "Must be greater or equal to 0.  The default is not to normalize")
-        .withShortName("n").create();
-    Option maxNGramSizeOpt = obuilder
-        .withLongName("maxNGramSize")
-        .withRequired(false)
-        .withArgument(
-          abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
+      abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + LLRReducer.DEFAULT_MIN_LLR)
+        .withShortName("ml").create();
+    
+    Option numReduceTasksOpt = obuilder.withLongName("numReducers").withArgument(
+      abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();
+    
+    Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
+      abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
+          + "Must be greater or equal to 0.  The default is not to normalize").withShortName("n").create();
+    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
+      abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
-        .withShortName("ng").create();
-    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector")
-        .withRequired(false)
-        .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
+    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
+        .withDescription(
+          "(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
         .withShortName("seq").create();
     
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
-      false).withDescription("If set, overwrite the output directory")
-        .withShortName("w").create();
-    Option helpOpt = obuilder.withLongName("help").withDescription(
-      "Print out help").withShortName("h").create();
-    
-    Group group = gbuilder.withName("Options").withOption(minSupportOpt)
-        .withOption(analyzerNameOpt).withOption(chunkSizeOpt).withOption(
-          outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
-        .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
-        .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
-          maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
-        .withOption(sequentialAccessVectorOpt)
+    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
+      "If set, overwrite the output directory").withShortName("w").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
+    
+    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
+        .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
+        .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
+        .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(
+          helpOpt).withOption(sequentialAccessVectorOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -182,8 +147,7 @@
       
       if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
         try {
-          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
-              .toString());
+          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
         } catch (NumberFormatException ex) {
           log.warn("Could not parse ngram size option");
         }
@@ -202,8 +166,7 @@
       
       int reduceTasks = 1;
       if (cmdLine.hasOption(numReduceTasksOpt)) {
-        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
-            .toString());
+        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
       }
       log.info("Pass1 reduce tasks: {}", reduceTasks);
       
@@ -237,8 +200,7 @@
       }
       int maxDFPercent = 99;
       if (cmdLine.hasOption(maxDFPercentOpt)) {
-        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt)
-            .toString());
+        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
       }
       
       float norm = PartialVectorMerger.NO_NORMALIZING;
@@ -251,23 +213,20 @@
         }
       }
       HadoopUtil.overwriteOutput(outputDir);
-      String tokenizedPath = outputDir
-                             + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
-      DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
-        tokenizedPath);
-
+      String tokenizedPath = outputDir + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+      DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
+      
       boolean sequentialAccessOutput = false;
       if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
         sequentialAccessOutput = true;
       }
       
-      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
-        minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
+      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
+        minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
       if (processIdf) {
-        TFIDFConverter.processTfIdf(
-          outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
-          outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf,
-          maxDFPercent, norm, sequentialAccessOutput);
+        TFIDFConverter.processTfIdf(outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
+          outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf, maxDFPercent, norm,
+          sequentialAccessOutput);
       }
     } catch (OptionException e) {
       log.error("Exception", e);
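
The reshuffled calls at the end of main() above are the whole vectorization pipeline once the option parsing is stripped away: tokenize the input sequence files, build term-frequency vectors, then optionally convert them to TF-IDF. Condensed out of the CLI handling, an invocation looks roughly like the sketch below. The method names, argument order, and constants come from the diff itself; the concrete paths and parameter values are hypothetical, and the exact numeric types are assumed (integer literals widen if the real parameters are long or float), as is defaulting to Lucene's StandardAnalyzer.

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
    import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
    import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
    import org.apache.mahout.utils.vectors.text.DocumentProcessor;
    import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;

    public final class VectorizationPipelineSketch {

      public static void main(String[] args) throws Exception {
        String inputDir = "/path/to/sequence-files";   // hypothetical input/output locations
        String outputDir = "/path/to/vector-output";
        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;

        // 1. Tokenize the SequenceFile<Text,Text> documents.
        String tokenizedPath = outputDir + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);

        // 2. Build term-frequency vectors (minSupport=2, unigrams only, 1 reducer, 100 MB chunks).
        int minSupport = 2;
        int maxNGramSize = 1;
        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        int reduceTasks = 1;
        int chunkSize = 100;
        boolean sequentialAccessOutput = false;
        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
            minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);

        // 3. Optionally convert the TF vectors to TF-IDF.
        int minDf = 1;
        int maxDFPercent = 99;
        float norm = PartialVectorMerger.NO_NORMALIZING;
        TFIDFConverter.processTfIdf(outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
            outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf, maxDFPercent, norm,
            sequentialAccessOutput);
      }
    }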

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java Sat Feb 13 17:55:56 2010
@@ -17,6 +17,11 @@
 
 package org.apache.mahout.utils;
 
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -36,51 +41,46 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-
 public class SequenceFileDumper {
-
+  
   private static final Logger log = LoggerFactory.getLogger(SequenceFileDumper.class);
-
+  
   private SequenceFileDumper() {
   }
-
+  
   public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
+    
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
-            abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
-            withDescription("The Sequence File containing the Clusters").withShortName("s").create();
+      abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).
+      withDescription("The Sequence File containing the Clusters").withShortName("s").create();
     Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
+      withDescription("The output file.  If not specified, dumps to the console").withShortName("o").create();
     Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
-            abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
-            withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
+      abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
+      withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
     Option countOpt = obuilder.withLongName("count").withRequired(false).
-            withDescription("Report the count only").withShortName("c").create();
+    withDescription("Report the count only").withShortName("c").create();
     Option helpOpt = obuilder.withLongName("help").
-            withDescription("Print out help").withShortName("h").create();
-
+    withDescription("Print out help").withShortName("h").create();
+    
     Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
     .withOption(substringOpt).withOption(countOpt).withOption(helpOpt).create();
-
+    
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-
+      
       if (cmdLine.hasOption(helpOpt)) {
-
-        printHelp(group);
+        
+        SequenceFileDumper.printHelp(group);
         return;
       }
-
+      
       if (cmdLine.hasOption(seqOpt)) {
         Path path = new Path(cmdLine.getValue(seqOpt).toString());
         JobClient client = new JobClient();
@@ -88,7 +88,7 @@
         client.setConf(conf);
         FileSystem fs = FileSystem.get(path.toUri(), conf);
         SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
-
+        
         Writer writer;
         if (cmdLine.hasOption(outputOpt)) {
           writer = new FileWriter(cmdLine.getValue(outputOpt).toString());
@@ -96,7 +96,7 @@
           writer = new OutputStreamWriter(System.out);
         }
         writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
-
+        
         int sub = Integer.MAX_VALUE;
         if (cmdLine.hasOption(substringOpt)) {
           sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
@@ -105,7 +105,7 @@
         Writable key = (Writable) reader.getKeyClass().newInstance();
         Writable value = (Writable) reader.getValueClass().newInstance();
         writer.append("Key class: ").append(String.valueOf(reader.getKeyClass())).append(" Value Class: ")
-            .append(String.valueOf(value.getClass())).append('\n');
+        .append(String.valueOf(value.getClass())).append('\n');
         writer.flush();
         long count = 0;
         if (countOnly == false) {
@@ -129,14 +129,14 @@
           writer.close();
         }
       }
-
+      
     } catch (OptionException e) {
-      log.error("Exception", e);
-      printHelp(group);
+      SequenceFileDumper.log.error("Exception", e);
+      SequenceFileDumper.printHelp(group);
     }
-
+    
   }
-
+  
   private static void printHelp(Group group) {
     HelpFormatter formatter = new HelpFormatter();
     formatter.setGroup(group);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Feb 13 17:55:56 2010
@@ -58,8 +58,7 @@
 
 public final class ClusterDumper {
   
-  private static final Logger log = LoggerFactory
-      .getLogger(ClusterDumper.class);
+  private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
   
   private final String seqFileDir;
   private final String pointsDir;
@@ -80,15 +79,13 @@
     if (this.pointsDir != null) {
       JobConf conf = new JobConf(Job.class);
       // read in the points
-      clusterIdToPoints = readPoints(this.pointsDir, conf);
+      clusterIdToPoints = ClusterDumper.readPoints(this.pointsDir, conf);
     } else {
       clusterIdToPoints = Collections.emptyMap();
     }
   }
   
-  public void printClusters() throws IOException,
-                             InstantiationException,
-                             IllegalAccessException {
+  public void printClusters() throws IOException, InstantiationException, IllegalAccessException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(Job.class);
     client.setConf(conf);
@@ -96,13 +93,10 @@
     String[] dictionary = null;
     if (this.termDictionary != null) {
       if (dictionaryFormat.equals("text")) {
-        dictionary = VectorHelper.loadTermDictionary(new File(
-            this.termDictionary));
+        dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
       } else if (dictionaryFormat.equals("sequencefile")) {
-        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(),
-          conf);
-        dictionary = VectorHelper.loadTermDictionary(conf, fs,
-          this.termDictionary);
+        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
+        dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
       } else {
         throw new IllegalArgumentException("Invalid dictionary format");
       }
@@ -115,13 +109,12 @@
       writer = new OutputStreamWriter(System.out);
     }
     
-    File[] seqFileList = new File(this.seqFileDir)
-        .listFiles(new FilenameFilter() {
-          @Override
-          public boolean accept(File file, String name) {
-            return name.endsWith(".crc") == false;
-          }
-        });
+    File[] seqFileList = new File(this.seqFileDir).listFiles(new FilenameFilter() {
+      @Override
+      public boolean accept(File file, String name) {
+        return name.endsWith(".crc") == false;
+      }
+    });
     for (File seqFile : seqFileList) {
       if (!seqFile.isFile()) {
         continue;
@@ -134,27 +127,25 @@
       ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
       while (reader.next(key, value)) {
         Vector center = value.getCenter();
-        String fmtStr = useJSON ? center.asFormatString() : VectorHelper
-            .vectorToString(center, dictionary);
+        String fmtStr = useJSON ? center.asFormatString() : VectorHelper.vectorToString(center, dictionary);
         writer.append("Id: ").append(String.valueOf(value.getId())).append(":");
         writer.append("name:").append(center.getName());
-        if (subString > 0) writer.append(":").append(
-          fmtStr.substring(0, Math.min(subString, fmtStr.length())));
+        if (subString > 0) {
+          writer.append(":").append(fmtStr.substring(0, Math.min(subString, fmtStr.length())));
+        }
         writer.append('\n');
-
+        
         if (dictionary != null) {
-          String topTerms = getTopFeatures(center, dictionary, 10);
+          String topTerms = ClusterDumper.getTopFeatures(center, dictionary, 10);
           writer.write("\tTop Terms: ");
           writer.write(topTerms);
           writer.write('\n');
         }
         
-        List<String> points = clusterIdToPoints.get(String.valueOf(value
-            .getId()));
+        List<String> points = clusterIdToPoints.get(String.valueOf(value.getId()));
         if (points != null) {
           writer.write("\tPoints: ");
-          for (Iterator<String> iterator = points.iterator(); iterator
-              .hasNext();) {
+          for (Iterator<String> iterator = points.iterator(); iterator.hasNext();) {
             String point = iterator.next();
             writer.append(point);
             if (iterator.hasNext()) {
@@ -202,65 +193,40 @@
     this.dictionaryFormat = dictionaryType;
   }
   
-  public static void main(String[] args) throws IOException,
-                                        IllegalAccessException,
-                                        InstantiationException {
+  public static void main(String[] args) throws IOException, IllegalAccessException, InstantiationException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
     
-    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false)
-        .withArgument(
-          abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1)
-              .create()).withDescription(
-          "The directory containing Sequence Files for the Clusters")
-        .withShortName("s").create();
-    Option outputOpt = obuilder.withLongName("output").withRequired(false)
-        .withArgument(
-          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "The output file.  If not specified, dumps to the console")
-        .withShortName("o").create();
-    Option substringOpt = obuilder
-        .withLongName("substring")
-        .withRequired(false)
-        .withArgument(
-          abuilder.withName("substring").withMinimum(1).withMaximum(1).create())
-        .withDescription("The number of chars of the asFormatString() to print")
-        .withShortName("b").create();
-    Option centroidJSonOpt = obuilder
-        .withLongName("json")
-        .withRequired(false)
-        .withDescription(
-          "Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries")
+    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(false).withArgument(
+      abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The directory containing Sequence Files for the Clusters").withShortName("s").create();
+    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The output file.  If not specified, dumps to the console").withShortName("o").create();
+    Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
+      abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The number of chars of the asFormatString() to print").withShortName("b").create();
+    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).withDescription(
+      "Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries")
         .withShortName("j").create();
-    Option pointsOpt = obuilder
-        .withLongName("pointsDir")
-        .withRequired(false)
-        .withArgument(
-          abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "The directory containing points sequence files mapping input vectors to their cluster.  "
-              + "If specified, then the program will output the points associated with a cluster")
-        .withShortName("p").create();
-    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false)
-        .withArgument(
-          abuilder.withName("dictionary").withMinimum(1).withMaximum(1)
-              .create()).withDescription("The dictionary file. ")
-        .withShortName("d").create();
-    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(
-      false).withArgument(
-      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1)
-          .create()).withDescription(
-      "The dictionary file type (text|sequencefile)").withShortName("dt")
+    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
+      abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The directory containing points sequence files mapping input vectors to their cluster.  "
+          + "If specified, then the program will output the points associated with a cluster").withShortName(
+      "p").create();
+    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
+      abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The dictionary file. ").withShortName("d").create();
+    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
+      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The dictionary file type (text|sequencefile)").withShortName("dt").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
-    Option helpOpt = obuilder.withLongName("help").withDescription(
-      "Print out help").withShortName("h").create();
     
-    Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(
-      seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(
-      pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt).withOption(
-      dictTypeOpt).create();
+    Group group = gbuilder.withName("Options").withOption(helpOpt).withOption(seqOpt).withOption(outputOpt)
+        .withOption(substringOpt).withOption(pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt)
+        .withOption(dictTypeOpt).create();
     
     try {
       Parser parser = new Parser();
@@ -314,7 +280,7 @@
       }
       clusterDumper.printClusters();
     } catch (OptionException e) {
-      log.error("Exception", e);
+      ClusterDumper.log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
@@ -323,8 +289,7 @@
     this.useJSON = json;
   }
   
-  private static Map<String,List<String>> readPoints(String pointsPathDir,
-                                                     JobConf conf) throws IOException {
+  private static Map<String,List<String>> readPoints(String pointsPathDir, JobConf conf) throws IOException {
     SortedMap<String,List<String>> result = new TreeMap<String,List<String>>();
     
     File[] children = new File(pointsPathDir).listFiles(new FilenameFilter() {
@@ -358,9 +323,9 @@
           pointList.add(key.toString());
         }
       } catch (InstantiationException e) {
-        log.error("Exception", e);
+        ClusterDumper.log.error("Exception", e);
       } catch (IllegalAccessException e) {
-        log.error("Exception", e);
+        ClusterDumper.log.error("Exception", e);
       }
     }
     
@@ -377,9 +342,7 @@
     }
   }
   
-  private static String getTopFeatures(Vector vector,
-                                       String[] dictionary,
-                                       int numTerms) {
+  private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
     
     List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();
     
@@ -403,7 +366,7 @@
       int index = vectorTerms.get(i).index;
       String dictTerm = dictionary[index];
       if (dictTerm == null) {
-        log.error("Dictionary entry missing for {}", index);
+        ClusterDumper.log.error("Dictionary entry missing for {}", index);
         continue;
       }
       topTerms.add(dictTerm);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java Sat Feb 13 17:55:56 2010
@@ -15,7 +15,6 @@
  * limitations under the License.
  */
 
-
 package org.apache.mahout.utils.nlp.collocations.llr;
 
 import java.io.IOException;
@@ -29,56 +28,54 @@
 import org.apache.mahout.utils.nlp.collocations.llr.Gram.Type;
 
 /** Combiner for pass1 of the CollocationDriver */
-public class CollocCombiner extends MapReduceBase implements
-  Reducer<Gram, Gram, Gram, Gram> {
-
-  /** collocation finder: pass 1 colloc phase:
-   *  
-   *  given input from the mapper,
-   *  k:h_subgram:1 v:ngram:1
-   *  k:t_subgram:1 v:ngram:1
-   *  
-   *  count ngrams and subgrams. 
-   *  
-   *  output is:
-   *  
-   *  k:h_subgram:subgramfreq v:ngram:ngramfreq
-   *  k:t_subgram:subgramfreq v:ngram:ngramfreq
-   *  
-   *  Each ngram's frequency is essentially counted twice, frequency should
-   *  be the same for the head and tail. Fix this to count only for the head
-   *  and move the count into the value?
+public class CollocCombiner extends MapReduceBase implements Reducer<Gram,Gram,Gram,Gram> {
+  
+  /**
+   * collocation finder: pass 1 colloc phase:
+   * 
+   * given input from the mapper, k:h_subgram:1 v:ngram:1 k:t_subgram:1 v:ngram:1
+   * 
+   * count ngrams and subgrams.
+   * 
+   * output is:
+   * 
+   * k:h_subgram:subgramfreq v:ngram:ngramfreq k:t_subgram:subgramfreq v:ngram:ngramfreq
+   * 
+   * Each ngram's frequency is essentially counted twice, frequency should be the same for the head and tail.
+   * Fix this to count only for the head and move the count into the value?
    */
   @Override
-  public void reduce(Gram subgramKey, Iterator<Gram> ngramValues,
-      OutputCollector<Gram, Gram> output, Reporter reporter) throws IOException {
-
+  public void reduce(Gram subgramKey,
+                     Iterator<Gram> ngramValues,
+                     OutputCollector<Gram,Gram> output,
+                     Reporter reporter) throws IOException {
+    
     HashMap<Gram,Gram> ngramSet = new HashMap<Gram,Gram>();
     int subgramFrequency = 0;
-
+    
     while (ngramValues.hasNext()) {
       Gram ngram = ngramValues.next();
       subgramFrequency += ngram.getFrequency();
-
+      
       Gram ngramCanon = ngramSet.get(ngram);
       if (ngramCanon == null) {
         // t is potentially reused, so create a new object to populate the HashMap
         Gram ngramEntry = new Gram(ngram);
-        ngramSet.put(ngramEntry,ngramEntry);
-      }
-      else {
+        ngramSet.put(ngramEntry, ngramEntry);
+      } else {
         ngramCanon.incrementFrequency(ngram.getFrequency());
       }
     }
-
+    
     // emit subgram:subgramFreq ngram:ngramFreq pairs
     subgramKey.setFrequency(subgramFrequency);
-
-    for (Gram ngram: ngramSet.keySet()) {
-      if(subgramKey.getType() == Type.UNIGRAM)
+    
+    for (Gram ngram : ngramSet.keySet()) {
+      if (subgramKey.getType() == Type.UNIGRAM) {
         ngram.setType(subgramKey.getType());
+      }
       output.collect(subgramKey, ngram);
     }
   }
-
+  
 }
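
The reformatted javadoc above spells out the pass-1 contract: for each subgram key the combiner adds up the subgram's total frequency and folds duplicate ngram values together before re-emitting subgram:subgramfreq / ngram:ngramfreq pairs. A minimal standalone model of that aggregation, using a simplified SimpleGram stand-in instead of Mahout's Gram and the Hadoop OutputCollector (so the names here are illustrative only), might look like this:

    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    /** Simplified stand-in for Mahout's Gram: an n-gram string plus a frequency. */
    final class SimpleGram {
      final String text;
      int frequency;
      SimpleGram(String text, int frequency) { this.text = text; this.frequency = frequency; }
      @Override public String toString() { return text + ':' + frequency; }
    }

    public final class CombinerSketch {

      /** Collapse the per-occurrence (subgram, ngram) pairs seen for one subgram key. */
      static List<String> combine(String subgram, Iterator<SimpleGram> ngramValues) {
        Map<String, SimpleGram> ngramSet = new LinkedHashMap<>();
        int subgramFrequency = 0;
        while (ngramValues.hasNext()) {
          SimpleGram ngram = ngramValues.next();
          subgramFrequency += ngram.frequency;       // total occurrences of the subgram
          SimpleGram canon = ngramSet.get(ngram.text);
          if (canon == null) {
            ngramSet.put(ngram.text, new SimpleGram(ngram.text, ngram.frequency));
          } else {
            canon.frequency += ngram.frequency;      // merge repeated ngrams
          }
        }
        List<String> emitted = new ArrayList<>();
        for (SimpleGram ngram : ngramSet.values()) {
          emitted.add(subgram + ':' + subgramFrequency + " -> " + ngram);
        }
        return emitted;
      }

      public static void main(String[] args) {
        List<SimpleGram> values = List.of(
            new SimpleGram("best of", 1), new SimpleGram("best of", 1), new SimpleGram("best in", 1));
        combine("h_best", values.iterator()).forEach(System.out::println);
        // prints: h_best:3 -> best of:2
        //         h_best:3 -> best in:1
      }
    }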

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Sat Feb 13 17:55:56 2010
@@ -51,7 +51,7 @@
 public class CollocDriver {
   public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
   public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams";
-  public static final String NGRAM_OUTPUT_DIRECTORY   = "ngrams";
+  public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams";
   
   public static final String EMIT_UNIGRAMS = "emit-unigrams";
   public static final boolean DEFAULT_EMIT_UNIGRAMS = false;
@@ -69,78 +69,57 @@
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
     
-    Option inputOpt = obuilder.withLongName("input").withRequired(true)
-        .withArgument(
-          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Path for input files.").withShortName("i")
-        .create();
-    
-    Option outputOpt = obuilder.withLongName("output").withRequired(true)
-        .withArgument(
-          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
-        .withDescription("The Path write output to").withShortName("o")
-        .create();
+    Option inputOpt = obuilder.withLongName("input").withRequired(true).withArgument(
+      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The Path for input files.").withShortName("i").create();
+    
+    Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
+      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The Path write output to").withShortName("o").create();
     
-    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize")
-        .withRequired(false).withArgument(
-          abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
+      abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
-        .withShortName("ng").create();
+              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2").withShortName("ng").create();
     
-    Option minSupportOpt = obuilder.withLongName("minSupport")
-        .withRequired(false).withArgument(
-          abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "(Optional) Minimum Support. Default Value: "
-              + CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s").create();
-    
-    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
-        .withArgument(
-          abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
-              + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
+    Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false).withArgument(
+      abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s")
+        .create();
     
-    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
-        .withRequired(false).withArgument(
-          abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
-              .create()).withDescription(
-          "(Optional) Number of reduce tasks. Default Value: "
-              + DEFAULT_PASS1_NUM_REDUCE_TASKS).withShortName("nr").create();
+    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
+      abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + LLRReducer.DEFAULT_MIN_LLR)
+        .withShortName("ml").create();
+    
+    Option numReduceTasksOpt = obuilder.withLongName("numReducers").withRequired(false).withArgument(
+      abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
+      "(Optional) Number of reduce tasks. Default Value: " + CollocDriver.DEFAULT_PASS1_NUM_REDUCE_TASKS)
+        .withShortName("nr").create();
     
-    Option preprocessOpt = obuilder.withLongName("preprocess")
-        .withRequired(false).withDescription(
+    Option preprocessOpt = obuilder.withLongName("preprocess").withRequired(false).withDescription(
       "If set, input is SequenceFile<Text,Text> where the value is the document, "
-          + " which will be tokenized using the specified analyzer.")
-        .withShortName("p").create();
+          + " which will be tokenized using the specified analyzer.").withShortName("p").create();
     
-    Option unigramOpt = obuilder
-        .withLongName("unigram")
-        .withRequired(false)
-        .withDescription(
-          "If set, unigrams will be emitted in the final output alongside collocations")
-        .withShortName("u").create();
+    Option unigramOpt = obuilder.withLongName("unigram").withRequired(false).withDescription(
+      "If set, unigrams will be emitted in the final output alongside collocations").withShortName("u")
+        .create();
+    
+    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
+      "If set, overwrite the output directory").withShortName("w").create();
+    
+    Option analyzerNameOpt = obuilder.withLongName("analyzerName").withArgument(
+      abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The class name of the analyzer").withShortName("a").create();
+    
+    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+        .create();
     
-    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
-      false).withDescription("If set, overwrite the output directory")
-        .withShortName("w").create();
-    
-    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
-        .withArgument(
-          abuilder.withName("analyzerName").withMinimum(1).withMaximum(1)
-              .create()).withDescription("The class name of the analyzer")
-        .withShortName("a").create();
-    
-    Option helpOpt = obuilder.withLongName("help").withDescription(
-      "Print out help").withShortName("h").create();
-    
-    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(
-      outputOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
-        .withOption(minSupportOpt).withOption(minLLROpt).withOption(
-          numReduceTasksOpt).withOption(analyzerNameOpt).withOption(
-          preprocessOpt).withOption(unigramOpt).withOption(helpOpt).create();
+    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
+      maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt).withOption(minLLROpt)
+        .withOption(numReduceTasksOpt).withOption(analyzerNameOpt).withOption(preprocessOpt).withOption(
+          unigramOpt).withOption(helpOpt).create();
     
     try {
       Parser parser = new Parser();
@@ -155,17 +134,16 @@
       String input = cmdLine.getValue(inputOpt).toString();
       String output = cmdLine.getValue(outputOpt).toString();
       
-      int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
+      int maxNGramSize = CollocDriver.DEFAULT_MAX_NGRAM_SIZE;
       
       if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
         try {
-          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
-              .toString());
+          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
         } catch (NumberFormatException ex) {
-          log.warn("Could not parse ngram size option");
+          CollocDriver.log.warn("Could not parse ngram size option");
         }
       }
-      log.info("Maximum n-gram size is: {}", maxNGramSize);
+      CollocDriver.log.info("Maximum n-gram size is: {}", maxNGramSize);
       
       if (cmdLine.hasOption(overwriteOutput) == true) {
         HadoopUtil.overwriteOutput(output);
@@ -174,28 +152,26 @@
       int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
       ;
       if (cmdLine.hasOption(minSupportOpt)) {
-        minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt)
-            .toString());
+        minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
       }
-      log.info("Minimum Support value: {}", minSupport);
+      CollocDriver.log.info("Minimum Support value: {}", minSupport);
       
       float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
       if (cmdLine.hasOption(minLLROpt)) {
         minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
       }
-      log.info("Minimum LLR value: {}", minLLRValue);
+      CollocDriver.log.info("Minimum LLR value: {}", minLLRValue);
       
-      int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
+      int reduceTasks = CollocDriver.DEFAULT_PASS1_NUM_REDUCE_TASKS;
       if (cmdLine.hasOption(numReduceTasksOpt)) {
-        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
-            .toString());
+        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
       }
-      log.info("Number of pass1 reduce tasks: {}", reduceTasks);
+      CollocDriver.log.info("Number of pass1 reduce tasks: {}", reduceTasks);
       
       boolean emitUnigrams = cmdLine.hasOption(unigramOpt);
       
       if (cmdLine.hasOption(preprocessOpt)) {
-        log.info("Input will be preprocessed");
+        CollocDriver.log.info("Input will be preprocessed");
         
         Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
         if (cmdLine.hasOption(analyzerNameOpt)) {
@@ -206,26 +182,23 @@
           analyzerClass.newInstance();
         }
         
-        String tokenizedPath = 
-          output + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+        String tokenizedPath = output + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
         
-        DocumentProcessor
-            .tokenizeDocuments(input, analyzerClass, tokenizedPath);
+        DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
         input = tokenizedPath;
       } else {
-        log.info("Input will NOT be preprocessed");
+        CollocDriver.log.info("Input will NOT be preprocessed");
       }
       
       // parse input and extract collocations
-      long ngramCount = generateCollocations(input, output, emitUnigrams,
-        maxNGramSize, reduceTasks, minSupport);
+      long ngramCount = CollocDriver.generateCollocations(input, output, emitUnigrams, maxNGramSize,
+        reduceTasks, minSupport);
       
       // tally collocations and perform LLR calculation
-      computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue,
-        reduceTasks);
+      CollocDriver.computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue, reduceTasks);
       
     } catch (OptionException e) {
-      log.error("Exception", e);
+      CollocDriver.log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
     
@@ -255,11 +228,11 @@
                                       float minLLRValue,
                                       int reduceTasks) throws IOException {
     // parse input and extract collocations
-    long ngramCount = generateCollocations(input, output, true, maxNGramSize,
-      reduceTasks, minSupport);
+    long ngramCount = CollocDriver.generateCollocations(input, output, true, maxNGramSize, reduceTasks,
+      minSupport);
     
     // tally collocations and perform LLR calculation
-    computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue, reduceTasks);
+    CollocDriver.computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue, reduceTasks);
   }
   
   /**
@@ -284,7 +257,7 @@
     conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, emitUnigrams);
     
     FileInputFormat.setInputPaths(conf, new Path(input));
-    Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
+    Path outPath = new Path(output, CollocDriver.SUBGRAM_OUTPUT_DIRECTORY);
     FileOutputFormat.setOutputPath(conf, outPath);
     
     conf.setInputFormat(SequenceFileInputFormat.class);
@@ -297,8 +270,7 @@
     conf.setNumReduceTasks(reduceTasks);
     
     RunningJob job = JobClient.runJob(conf);
-    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL)
-        .getValue();
+    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
   }
   
   /**
@@ -320,8 +292,8 @@
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(DoubleWritable.class);
     
-    FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
-    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
+    FileInputFormat.setInputPaths(conf, new Path(output, CollocDriver.SUBGRAM_OUTPUT_DIRECTORY));
+    Path outPath = new Path(output, CollocDriver.NGRAM_OUTPUT_DIRECTORY);
     FileOutputFormat.setOutputPath(conf, outPath);
     
     conf.setMapperClass(IdentityMapper.class);

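The driver's main() above reduces to a two-pass pipeline: generateCollocations emits and counts head/tail subgrams and reports the total ngram count, then computeNGramsPruneByLLR tallies those counts and prunes by log-likelihood ratio. A minimal sketch of that flow as a standalone caller; the paths and tuning values are hypothetical, the call shapes are taken from the hunk above, and it assumes the two static helpers are accessible outside the driver.

import java.io.IOException;

import org.apache.mahout.utils.nlp.collocations.llr.CollocDriver;

public class CollocPipelineSketch {
  public static void main(String[] args) throws IOException {
    String input = "/tmp/tokenized-docs";  // hypothetical path to tokenized SequenceFile input
    String output = "/tmp/colloc-output";  // hypothetical output directory

    // pass 1: emit head/tail subgrams for every ngram and count them; returns the total ngram count
    long ngramCount = CollocDriver.generateCollocations(input, output,
        true /* emitUnigrams */, 3 /* maxNGramSize */, 1 /* reduceTasks */, 2 /* minSupport */);

    // pass 2: tally subgram/ngram counts and keep only ngrams whose LLR clears the threshold
    CollocDriver.computeNGramsPruneByLLR(ngramCount, output,
        true /* emitUnigrams */, 1.0f /* minLLRValue */, 1 /* reduceTasks */);
  }
}
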
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java Sat Feb 13 17:55:56 2010
@@ -41,16 +41,14 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Runs pass 1 of the Collocation discovery job on input of
- * SequeceFile<Text,Text>, where the key is a document id and the value is the
- * document contents. . Delegates to NGramCollector to perform tokenization,
+ * Runs pass 1 of the Collocation discovery job on input of SequenceFile<Text,Text>, where the key is a
+ * document id and the value is the document contents. Delegates to NGramCollector to perform tokenization,
  * ngram-creation and output collection.
  * 
  * @see org.apache.mahout.text.SequenceFilesFromDirectory
  * @see org.apache.mahout.utils.nlp.collocations.llr.colloc.NGramCollector
  */
-public class CollocMapper extends MapReduceBase implements
-    Mapper<Text,StringTuple,Gram,Gram> {
+public class CollocMapper extends MapReduceBase implements Mapper<Text,StringTuple,Gram,Gram> {
   
   public static final String MAX_SHINGLE_SIZE = "maxShingleSize";
   public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
@@ -68,41 +66,36 @@
   public void configure(JobConf job) {
     super.configure(job);
     
-    this.maxShingleSize = job.getInt(CollocMapper.MAX_SHINGLE_SIZE,
-      DEFAULT_MAX_SHINGLE_SIZE);
+    this.maxShingleSize = job.getInt(CollocMapper.MAX_SHINGLE_SIZE, CollocMapper.DEFAULT_MAX_SHINGLE_SIZE);
     
-    this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS,
-      CollocDriver.DEFAULT_EMIT_UNIGRAMS);
+    this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
     
-    if (log.isInfoEnabled()) {
-      log.info("Max Ngram size is {}", this.maxShingleSize);
-      log.info("Emit Unitgrams is {}", emitUnigrams);
+    if (CollocMapper.log.isInfoEnabled()) {
+      CollocMapper.log.info("Max Ngram size is {}", this.maxShingleSize);
+      CollocMapper.log.info("Emit Unitgrams is {}", emitUnigrams);
     }
   }
   
   /**
    * Collocation finder: pass 1 map phase.
    * 
-   * Receives a token stream which gets passed through the ShingleFilter. The
-   * ShingleFilter delivers ngrams of the appropriate size which are then
-   * decomposed into head and tail subgrams which are collected in the following
-   * manner
+   * Receives a token stream which gets passed through the ShingleFilter. The ShingleFilter delivers ngrams of
+   * the appropriate size which are then decomposed into head and tail subgrams which are collected in the
+   * following manner
    * 
    * k:h_subgram v:ngram k:t_subgram v:ngram
    * 
-   * The 'h_' or 't_' prefix is used to specify whether the subgram in question
-   * is the head or tail of the ngram. In this implementation the head of the
-   * ngram is a (n-1)gram, and the tail is a (1)gram.
+   * The 'h_' or 't_' prefix is used to specify whether the subgram in question is the head or tail of the
+   * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
    * 
-   * For example, given 'click and clack' and an ngram length of 3: k:'h_click
-   * and' v:'click and clack' k;'t_clack' v:'click and clack'
+   * For example, given 'click and clack' and an ngram length of 3: k:'h_click and' v:'click and clack'
+   * k:'t_clack' v:'click and clack'
    * 
-   * Also counts the total number of ngrams encountered and adds it to the
-   * counter CollocDriver.Count.NGRAM_TOTAL
+   * Also counts the total number of ngrams encountered and adds it to the counter
+   * CollocDriver.Count.NGRAM_TOTAL
    * 
    * @param r
-   *          The reader to read input from -- used to create a tokenstream from
-   *          the analyzer
+   *          The reader to read input from -- used to create a tokenstream from the analyzer
    * 
    * @param collector
    *          The collector to send output to
@@ -111,28 +104,21 @@
    *          Used to deliver the final ngram-count.
    * 
    * @throws IOException
-   *           if there's a problem with the ShingleFilter reading data or the
-   *           collector collecting output.
+   *           if there's a problem with the ShingleFilter reading data or the collector collecting output.
    */
   @Override
-  public void map(Text key,
-                  StringTuple value,
-                  final OutputCollector<Gram,Gram> collector,
-                  Reporter reporter) throws IOException {
+  public void map(Text key, StringTuple value, final OutputCollector<Gram,Gram> collector, Reporter reporter) throws IOException {
     
-    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
-        .getEntries().iterator()), maxShingleSize);
+    ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
+        maxShingleSize);
     int count = 0; // ngram count
-    OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
-        value.getEntries().size() * (maxShingleSize - 1));
-    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(
-        value.getEntries().size());
+    OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(value.getEntries().size()
+                                                                           * (maxShingleSize - 1));
+    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());
     
     do {
-      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
-          .term();
-      String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class))
-          .type();
+      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
+      String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class)).type();
       if ("shingle".equals(type)) {
         count++;
         if (ngrams.containsKey(term) == false) {
@@ -158,10 +144,8 @@
         int i = term.lastIndexOf(' ');
         if (i != -1) { // bigram, trigram etc
           try {
-            collector.collect(new Gram(term.substring(0, i), frequency, HEAD),
-              ngram);
-            collector.collect(new Gram(term.substring(i + 1), frequency, TAIL),
-              ngram);
+            collector.collect(new Gram(term.substring(0, i), frequency, HEAD), ngram);
+            collector.collect(new Gram(term.substring(i + 1), frequency, TAIL), ngram);
           } catch (IOException e) {
             throw new RuntimeException(e);
           }

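The map-phase Javadoc above describes decomposing each shingle into a head (n-1)gram and a tail (1)gram; in the code this is simply a split at the last space, as the two collect calls in the hunk show. A self-contained sketch of that step — the real mapper wraps the pieces in Gram objects and hands them to the output collector, which is only noted in comments here:

public class SubgramSplitSketch {
  public static void main(String[] args) {
    String ngram = "click and clack"; // a size-3 shingle from the ShingleFilter
    int i = ngram.lastIndexOf(' ');
    if (i != -1) { // bigram, trigram etc.; unigrams have no space and are handled separately
      String head = ngram.substring(0, i);  // "click and" -> collected as Gram(head, frequency, HEAD)
      String tail = ngram.substring(i + 1); // "clack"     -> collected as Gram(tail, frequency, TAIL)
      System.out.println("k:h_" + head + " v:" + ngram);
      System.out.println("k:t_" + tail + " v:" + ngram);
    }
  }
}
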
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java Sat Feb 13 17:55:56 2010
@@ -31,11 +31,9 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Reducer for Pass 1 of the collocation identification job. Generates counts
- * for ngrams and subgrams.
+ * Reducer for Pass 1 of the collocation identification job. Generates counts for ngrams and subgrams.
  */
-public class CollocReducer extends MapReduceBase implements
-    Reducer<Gram,Gram,Gram,Gram> {
+public class CollocReducer extends MapReduceBase implements Reducer<Gram,Gram,Gram,Gram> {
   
   public static final String MIN_SUPPORT = "minSupport";
   public static final int DEFAULT_MIN_SUPPORT = 2;
@@ -53,14 +51,13 @@
   public void configure(JobConf job) {
     super.configure(job);
     
-    this.minSupport = job.getInt(MIN_SUPPORT, DEFAULT_MIN_SUPPORT);
+    this.minSupport = job.getInt(CollocReducer.MIN_SUPPORT, CollocReducer.DEFAULT_MIN_SUPPORT);
     
-    this.emitUnigrams =
-      job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
+    this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
     
-    if (log.isInfoEnabled()) {
-      log.info("Min support is {}", minSupport);
-      log.info("Emit Unitgrams is {}", emitUnigrams);
+    if (CollocReducer.log.isInfoEnabled()) {
+      CollocReducer.log.info("Min support is {}", minSupport);
+      CollocReducer.log.info("Emit Unitgrams is {}", emitUnigrams);
     }
     
   }
@@ -68,20 +65,16 @@
   /**
    * collocation finder: pass 1 reduce phase:
    * 
-   * given input from the mapper, 
-   *   k:h_subgram v:ngram 
-   *   k:t_subgram v:ngram
+   * given input from the mapper, k:h_subgram v:ngram k:t_subgram v:ngram
    * 
    * count ngrams and subgrams.
    * 
    * output is:
    * 
-   * k:ngram:ngramfreq v:h_subgram:h_subgramfreq 
-   * k:ngram:ngramfreq v:t_subgram:t_subgramfreq
+   * k:ngram:ngramfreq v:h_subgram:h_subgramfreq k:ngram:ngramfreq v:t_subgram:t_subgramfreq
    * 
-   * Each ngram's frequency is essentially counted twice, frequency should be
-   * the same for the head and tail. Fix this to count only for the head and
-   * move the count into the value?
+   * Each ngram's frequency is essentially counted twice; the frequency should be the same for the head and tail.
+   * Fix this to count only for the head and move the count into the value?
    */
   @Override
   public void reduce(Gram subgramKey,
@@ -115,8 +108,9 @@
         reporter.incrCounter(Skipped.LESS_THAN_MIN_SUPPORT, 1);
         continue;
       }
-      if(subgramKey.getType() == Type.UNIGRAM)
+      if (subgramKey.getType() == Type.UNIGRAM) {
         ngram.setType(subgramKey.getType());
+      }
       output.collect(ngram, subgramKey);
     }
   }

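To make the pass-1 reduce convention above (k:ngram:ngramfreq v:subgram:subgramfreq) concrete, here is a deliberately simplified stand-in with invented counts. The real reducer consumes Gram values that already carry frequencies and applies the minSupport cut; this sketch only mirrors the shape of its output.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class Pass1ReduceSketch {
  public static void main(String[] args) {
    // hypothetical values arriving for the head subgram "click and"
    List<String> ngrams = Arrays.asList("click and clack", "click and clack", "click and drive");

    int subgramFreq = ngrams.size(); // simplified: treat each value as one occurrence of the subgram
    Map<String,Integer> ngramFreq = new HashMap<String,Integer>();
    for (String ngram : ngrams) {
      Integer f = ngramFreq.get(ngram);
      ngramFreq.put(ngram, f == null ? 1 : f + 1);
    }

    // emit k:ngram:ngramfreq  v:h_subgram:h_subgramfreq (minSupport filtering omitted)
    for (Map.Entry<String,Integer> e : ngramFreq.entrySet()) {
      System.out.println("k:" + e.getKey() + ":" + e.getValue()
          + "  v:h_click and:" + subgramFreq);
    }
  }
}
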
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/Gram.java Sat Feb 13 17:55:56 2010
@@ -27,11 +27,10 @@
 import org.apache.hadoop.io.WritableComparable;
 
 /**
- * Writable for holding data generated from the collocation discovery jobs.
- * Depending on the job configuration gram may be one or more words. In some
- * contexts this is used to hold a complete ngram, while in others it holds a
- * part of an existing ngram (subgram). Tracks the frequency of the gram and its
- * position in the ngram in which is was found.
+ * Writable for holding data generated from the collocation discovery jobs. Depending on the job configuration,
+ * a gram may be one or more words. In some contexts this is used to hold a complete ngram, while in others it
+ * holds a part of an existing ngram (subgram). Tracks the frequency of the gram and its position in the ngram
+ * in which it was found.
  */
 public class Gram implements WritableComparable<Gram> {
   
@@ -71,8 +70,7 @@
    * @param gram
    *          the gram string
    * @param type
-   *          whether the gram is at the head of its text unit or tail or
-   *          unigram
+   *          whether the gram is at the head of its text unit or tail or unigram
    */
   public Gram(String ngram, Type type) {
     this(ngram, 1, type);
@@ -97,8 +95,7 @@
    * @param frequency
    *          the gram frequency
    * @param type
-   *          whether the gram is at the head of its text unit or tail or
-   *          unigram
+   *          whether the gram is at the head of its text unit or tail or unigram
    */
   public Gram(String ngram, int frequency, Type type) {
     this.gram = ngram;
@@ -115,8 +112,7 @@
   
   /**
    * @param part
-   *          whether the gram is at the head of its text unit or tail or
-   *          unigram
+   *          whether the gram is at the head of its text unit or tail or unigram
    */
   public void setType(Type type) {
     this.type = type;
@@ -162,9 +158,13 @@
     frequency = in.readInt();
     int typeValue = in.readUnsignedByte();
     
-    if (typeValue == 0) type = Type.TAIL;
-    else if (typeValue == 1) type = Type.HEAD;
-    else type = Type.UNIGRAM;
+    if (typeValue == 0) {
+      type = Type.TAIL;
+    } else if (typeValue == 1) {
+      type = Type.HEAD;
+    } else {
+      type = Type.UNIGRAM;
+    }
     
     Text data = new Text();
     data.readFields(in);
@@ -175,9 +175,13 @@
   public void write(DataOutput out) throws IOException {
     out.writeInt(frequency);
     
-    if (type == Type.TAIL) out.writeByte(0);
-    else if (type == Type.HEAD) out.writeByte(1);
-    else out.writeByte(2);
+    if (type == Type.TAIL) {
+      out.writeByte(0);
+    } else if (type == Type.HEAD) {
+      out.writeByte(1);
+    } else {
+      out.writeByte(2);
+    }
     
     Text data = new Text(gram);
     data.write(out);
@@ -214,8 +218,8 @@
   public int hashCode() {
     final int prime = 31;
     int result = 1;
-    result = prime * result + ((gram == null) ? 0 : gram.hashCode());
-    result = prime * result + ((type == null) ? 0 : type.hashCode());
+    result = prime * result + (gram == null ? 0 : gram.hashCode());
+    result = prime * result + (type == null ? 0 : type.hashCode());
     return result;
   }
   
@@ -224,23 +228,36 @@
    */
   @Override
   public boolean equals(Object obj) {
-    if (this == obj) return true;
-    if (obj == null) return false;
-    if (getClass() != obj.getClass()) return false;
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
     Gram other = (Gram) obj;
     if (gram == null) {
-      if (other.gram != null) return false;
-    } else if (!gram.equals(other.gram)) return false;
+      if (other.gram != null) {
+        return false;
+      }
+    } else if (!gram.equals(other.gram)) {
+      return false;
+    }
     if (type == null) {
-      if (other.type != null) return false;
-    } else if (!type.equals(other.type)) return false;
+      if (other.type != null) {
+        return false;
+      }
+    } else if (!type.equals(other.type)) {
+      return false;
+    }
     return true;
   }
   
   @Override
   public String toString() {
-    return "'" + gram + "'["
-           + (type == Type.UNIGRAM ? "u" : (type == Type.HEAD ? "h" : "t")) + "]:"
+    return "'" + gram + "'[" + (type == Type.UNIGRAM ? "u" : type == Type.HEAD ? "h" : "t") + "]:"
            + frequency;
   }
   

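Gram's write/readFields above fix a simple wire format: an int frequency, one type byte (0 = TAIL, 1 = HEAD, 2 = UNIGRAM), then the gram text. A round-trip sketch under two assumptions not shown in this hunk — that Gram has the no-argument constructor Hadoop Writables need, and that the Type referenced throughout the diff is the nested Gram.Type:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.mahout.utils.nlp.collocations.llr.Gram;

public class GramRoundTripSketch {
  public static void main(String[] args) throws IOException {
    // Gram.Type.HEAD is an assumption; only the unqualified Type is visible in the diff
    Gram original = new Gram("click and", 3, Gram.Type.HEAD);

    // write(): int frequency, one type byte (0=TAIL, 1=HEAD, 2=UNIGRAM), then the gram text
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));

    // readFields(): reverses the same layout (assumes the default constructor Writables require)
    Gram copy = new Gram();
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

    System.out.println(copy); // 'click and'[h]:3
  }
}
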
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducer.java Sat Feb 13 17:55:56 2010
@@ -33,11 +33,10 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Reducer for pass 2 of the collocation discovery job. Collects ngram and
- * sub-ngram frequencies and performs the Log-likelihood ratio calculation.
+ * Reducer for pass 2 of the collocation discovery job. Collects ngram and sub-ngram frequencies and performs
+ * the Log-likelihood ratio calculation.
  */
-public class LLRReducer extends MapReduceBase implements
-    Reducer<Gram,Gram,Text,DoubleWritable> {
+public class LLRReducer extends MapReduceBase implements Reducer<Gram,Gram,Text,DoubleWritable> {
   
   public static enum Skipped {
     EXTRA_HEAD,
@@ -80,16 +79,15 @@
   public void configure(JobConf job) {
     super.configure(job);
     
-    this.ngramTotal = job.getLong(NGRAM_TOTAL, -1);
-    this.minLLRValue = job.getFloat(MIN_LLR, DEFAULT_MIN_LLR);
+    this.ngramTotal = job.getLong(LLRReducer.NGRAM_TOTAL, -1);
+    this.minLLRValue = job.getFloat(LLRReducer.MIN_LLR, LLRReducer.DEFAULT_MIN_LLR);
     
-    this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS,
-      CollocDriver.DEFAULT_EMIT_UNIGRAMS);
+    this.emitUnigrams = job.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
     
-    if (log.isInfoEnabled()) {
-      log.info("NGram Total is {}", ngramTotal);
-      log.info("Min LLR value is {}", minLLRValue);
-      log.info("Emit Unitgrams is {}", emitUnigrams);
+    if (LLRReducer.log.isInfoEnabled()) {
+      LLRReducer.log.info("NGram Total is {}", ngramTotal);
+      LLRReducer.log.info("Min LLR value is {}", minLLRValue);
+      LLRReducer.log.info("Emit Unitgrams is {}", emitUnigrams);
     }
     
     if (ngramTotal == -1) {
@@ -98,17 +96,13 @@
   }
   
   /**
-   * Perform LLR calculation, input is: k:ngram:ngramFreq
-   * v:(h_|t_)subgram:subgramfreq N = ngram total
+   * Perform LLR calculation, input is: k:ngram:ngramFreq v:(h_|t_)subgram:subgramfreq N = ngram total
    * 
-   * Each ngram will have 2 subgrams, a head and a tail, referred to as A and B
-   * respectively below.
+   * Each ngram will have 2 subgrams, a head and a tail, referred to as A and B respectively below.
    * 
-   * A+ B: number of times a+b appear together: ngramFreq A+!B: number of times
-   * A appears without B: hSubgramFreq - ngramFreq !A+ B: number of times B
-   * appears without A: tSubgramFreq - ngramFreq !A+!B: number of times neither
-   * A or B appears (in that order): N - (subgramFreqA + subgramFreqB -
-   * ngramFreq)
+   * A+ B: number of times a+b appear together: ngramFreq A+!B: number of times A appears without B:
+   * hSubgramFreq - ngramFreq !A+ B: number of times B appears without A: tSubgramFreq - ngramFreq !A+!B:
+   * number of times neither A nor B appears (in that order): N - (subgramFreqA + subgramFreqB - ngramFreq)
    */
   @Override
   public void reduce(Gram key,
@@ -132,10 +126,10 @@
     while (values.hasNext()) {
       Gram value = values.next();
       
-      int pos = (value.getType() == Type.HEAD ? 0 : 1);
+      int pos = value.getType() == Type.HEAD ? 0 : 1;
       
       if (gramFreq[pos] != -1) {
-        log.warn("Extra {} for {}, skipping", value.getType(), ngram);
+        LLRReducer.log.warn("Extra {} for {}, skipping", value.getType(), ngram);
         if (value.getType() == Type.HEAD) {
           reporter.incrCounter(Skipped.EXTRA_HEAD, 1);
         } else {
@@ -149,11 +143,11 @@
     }
     
     if (gramFreq[0] == -1) {
-      log.warn("Missing head for {}, skipping.", ngram);
+      LLRReducer.log.warn("Missing head for {}, skipping.", ngram);
       reporter.incrCounter(Skipped.MISSING_HEAD, 1);
       return;
     } else if (gramFreq[1] == -1) {
-      log.warn("Missing tail for {}, skipping", ngram);
+      LLRReducer.log.warn("Missing tail for {}, skipping", ngram);
       reporter.incrCounter(Skipped.MISSING_TAIL, 1);
       return;
     }
@@ -161,8 +155,7 @@
     int k11 = ngram.getFrequency(); /* a&b */
     int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b */
     int k21 = gramFreq[1] - ngram.getFrequency(); /* !b&a */
-    int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram
-        .getFrequency())); /* !a&!b */
+    int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency())); /* !a&!b */
     
     try {
       double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
@@ -175,18 +168,16 @@
       output.collect(t, dd);
     } catch (IllegalArgumentException ex) {
       reporter.incrCounter(Skipped.LLR_CALCULATION_ERROR, 1);
-      log.error("Problem calculating LLR ratio: " + ex.getMessage());
-      log.error("NGram: " + ngram);
-      log.error("HEAD: " + gram[0] + ":" + gramFreq[0]);
-      log.error("TAIL: " + gram[1] + ":" + gramFreq[1]);
-      log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: "
-                + k22);
+      LLRReducer.log.error("Problem calculating LLR ratio: " + ex.getMessage());
+      LLRReducer.log.error("NGram: " + ngram);
+      LLRReducer.log.error("HEAD: " + gram[0] + ":" + gramFreq[0]);
+      LLRReducer.log.error("TAIL: " + gram[1] + ":" + gramFreq[1]);
+      LLRReducer.log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: " + k22);
     }
   }
   
   /**
-   * provide interface so the input to the llr calculation can be captured for
-   * validation in unit testing
+   * provide interface so the input to the llr calculation can be captured for validation in unit testing
    */
   public static interface LLCallback {
     public double logLikelihoodRatio(int k11, int k12, int k21, int k22);

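The LLR Javadoc above defines a 2x2 contingency table from three counts plus the ngram total, and the reducer builds exactly these four cells before calling LLCallback.logLikelihoodRatio. A worked sketch with invented counts; note that k21 is the tail occurring without the head.

public class LlrCellsSketch {
  public static void main(String[] args) {
    // hypothetical counts for the ngram "click and clack"
    long ngramTotal = 10000;  // N: total ngrams seen in pass 1
    int ngramFreq = 80;       // times the full ngram appears
    int headFreq = 200;       // times the head subgram "click and" appears
    int tailFreq = 300;       // times the tail subgram "clack" appears

    int k11 = ngramFreq;                                              // head and tail together
    int k12 = headFreq - ngramFreq;                                   // head without tail
    int k21 = tailFreq - ngramFreq;                                   // tail without head
    int k22 = (int) (ngramTotal - (headFreq + tailFreq - ngramFreq)); // neither

    // these four cells are what the reducer hands to LLCallback.logLikelihoodRatio(...)
    System.out.println("k11=" + k11 + " k12=" + k12 + " k21=" + k21 + " k22=" + k22);
  }
}
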
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java Sat Feb 13 17:55:56 2010
@@ -38,9 +38,8 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * Performs tokenization, ngram generation + collection for the first pass of
- * the LLR collocation discovery job. Factors this code out of the mappers so
- * that different input formats can be supported.
+ * Performs tokenization, ngram generation + collection for the first pass of the LLR collocation discovery
+ * job. Factors this code out of the mappers so that different input formats can be supported.
  * 
  * @see org.apache.mahout.utils.nlp.collocations.llr.colloc.CollocMapperTextFile
  */
@@ -53,12 +52,10 @@
     NGRAM_TOTAL;
   }
   
-  private static final Logger log = LoggerFactory
-      .getLogger(NGramCollector.class);
+  private static final Logger log = LoggerFactory.getLogger(NGramCollector.class);
   
   /**
-   * An analyzer to perform tokenization. A ShingleFilter will be wrapped around
-   * its output to create ngrams
+   * An analyzer to perform tokenization. A ShingleFilter will be wrapped around its output to create ngrams
    */
   private Analyzer a;
   
@@ -70,12 +67,10 @@
   /**
    * Configure the NGramCollector.
    * 
-   * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is
-   * provided. Otherwise a lucene StandardAnalyzer will be used that is set to
-   * be compatible to LUCENE_24.
+   * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is provided. Otherwise a lucene
+   * StandardAnalyzer will be used that is set to be compatible with LUCENE_24.
    * 
-   * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the
-   * ShingleFilter.
+   * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the ShingleFilter.
    * 
    * @param job
    */
@@ -104,33 +99,30 @@
     
     this.maxShingleSize = job.getInt(NGramCollector.MAX_SHINGLE_SIZE, 2);
     
-    if (log.isInfoEnabled()) {
-      log.info("Analyzer is {}", this.a.toString());
-      log.info("Max Ngram size is {}", this.maxShingleSize);
+    if (NGramCollector.log.isInfoEnabled()) {
+      NGramCollector.log.info("Analyzer is {}", this.a.toString());
+      NGramCollector.log.info("Max Ngram size is {}", this.maxShingleSize);
     }
   }
   
   /**
-   * Receives a document and uses a lucene analyzer to tokenize them. The
-   * ShingleFilter delivers ngrams of the appropriate size which aren then
-   * decomposed into head and tail subgrams which are collected in the following
-   * manner
+   * Receives a document and uses a lucene analyzer to tokenize it. The ShingleFilter delivers ngrams of the
+   * appropriate size which are then decomposed into head and tail subgrams which are collected in the
+   * following manner
    * 
    * k:h_subgram v:ngram k:t_subgram v:ngram
    * 
-   * The 'h_' or 't_' prefix is used to specify whether the subgram in question
-   * is the head or tail of the ngram. In this implementation the head of the
-   * ngram is a (n-1)gram, and the tail is a (1)gram.
+   * The 'h_' or 't_' prefix is used to specify whether the subgram in question is the head or tail of the
+   * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram.
    * 
-   * For example, given 'click and clack' and an ngram length of 3: k:'h_click
-   * and' v:'clack and clack' k;'t_clack' v:'click and clack'
+   * For example, given 'click and clack' and an ngram length of 3: k:'h_click and' v:'click and clack'
+   * k:'t_clack' v:'click and clack'
    * 
-   * Also counts the total number of ngrams encountered and adds it to the
-   * counter CollocDriver.Count.NGRAM_TOTAL
+   * Also counts the total number of ngrams encountered and adds it to the counter
+   * CollocDriver.Count.NGRAM_TOTAL
    * 
    * @param r
-   *          The reader to read input from -- used to create a tokenstream from
-   *          the analyzer
+   *          The reader to read input from -- used to create a tokenstream from the analyzer
    * 
    * @param collector
    *          The collector to send output to
@@ -139,12 +131,9 @@
    *          Used to deliver the final ngram-count.
    * 
    * @throws IOException
-   *           if there's a problem with the ShingleFilter reading data or the
-   *           collector collecting output.
+   *           if there's a problem with the ShingleFilter reading data or the collector collecting output.
    */
-  public void collectNgrams(Reader r,
-                            OutputCollector<Gram,Gram> collector,
-                            Reporter reporter) throws IOException {
+  public void collectNgrams(Reader r, OutputCollector<Gram,Gram> collector, Reporter reporter) throws IOException {
     TokenStream st = a.tokenStream("text", r);
     ShingleFilter sf = new ShingleFilter(st, maxShingleSize);
     
@@ -152,10 +141,8 @@
     int count = 0; // ngram count
     
     do {
-      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
-          .term();
-      String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class))
-          .type();
+      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
+      String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class)).type();
       
       if ("shingle".equals(type)) {
         count++;