Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/11 07:01:20 UTC

svn commit: r908851 - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/nlp/collocations/llr/ main/java/org/apache/mahout/utils/vectors/text/ main/java/org/apache/mahout/utils/vectors/tfidf/ test/jav...

Author: robinanil
Date: Thu Feb 11 06:01:19 2010
New Revision: 908851

URL: http://svn.apache.org/viewvc?rev=908851&view=rev
Log:
Checking in Drew's fixes. Functional Vectorizer

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Thu Feb 11 06:01:19 2010
@@ -153,11 +153,18 @@
     
     Parser parser = new Parser();
     parser.setGroup(group);
-    CommandLine cmdLine = parser.parse(args);
+    CommandLine cmdLine = null;
     
-    if (cmdLine.hasOption(helpOpt)) {
-      CommandLineUtil.printHelp(group);
-      return;
+    try {
+      // the standard help option won't work here because
+      // outputDir is required and an exception will
+      // be thrown if it is not present.
+      cmdLine = parser.parse(args);
+    } catch (OptionException oe) {
+      System.out.println(oe.getMessage());
+      CommandLineUtil.printHelp(group);
+      return;
     }
     
     String inputDir = (String) cmdLine.getValue(inputDirOpt);
@@ -246,7 +253,7 @@
       }
     }
     HadoopUtil.overwriteOutput(outputDir);
-    String tokenizedPath = outputDir + "/tokenized-documents";
+    String tokenizedPath = outputDir + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
     DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
     
     DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
@@ -254,7 +261,7 @@
     if (processIdf) {
       TFIDFConverter.processTfIdf(
         outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
-        outputDir + "/tfidf", chunkSize, minDf, maxDFPercent, norm);
+        outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf, maxDFPercent, norm);
     }
   }
 }

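For context, the pattern this hunk introduces, in minimal standalone form: once a required option such as outputDir is in the group, parse() throws before any --help flag can be inspected, so the OptionException handler becomes the help path. This is a sketch assuming commons-cli2; the option names are illustrative, and the formatter calls mirror what CommandLineUtil.printHelp does with a commons-cli2 HelpFormatter.

    import org.apache.commons.cli2.CommandLine;
    import org.apache.commons.cli2.Group;
    import org.apache.commons.cli2.Option;
    import org.apache.commons.cli2.OptionException;
    import org.apache.commons.cli2.builder.ArgumentBuilder;
    import org.apache.commons.cli2.builder.DefaultOptionBuilder;
    import org.apache.commons.cli2.builder.GroupBuilder;
    import org.apache.commons.cli2.commandline.Parser;
    import org.apache.commons.cli2.util.HelpFormatter;

    public class RequiredOptionHelpSketch {
      public static void main(String[] args) {
        Option outputDirOpt = new DefaultOptionBuilder().withLongName("outputDir")
            .withRequired(true).withArgument(new ArgumentBuilder()
              .withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();
        Group group = new GroupBuilder().withName("Options")
            .withOption(outputDirOpt).create();

        Parser parser = new Parser();
        parser.setGroup(group);
        try {
          CommandLine cmdLine = parser.parse(args);
          System.out.println("outputDir: " + cmdLine.getValue(outputDirOpt));
        } catch (OptionException oe) {
          // a bare --help lands here too, since outputDir is missing
          System.out.println(oe.getMessage());
          HelpFormatter formatter = new HelpFormatter();
          formatter.setGroup(group);
          formatter.print();
        }
      }
    }
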
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocCombiner.java Thu Feb 11 06:01:19 2010
@@ -28,10 +28,11 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.utils.nlp.collocations.llr.Gram.Type;
 
+/** Combiner for pass1 of the CollocDriver */
 public class CollocCombiner extends MapReduceBase implements
-Reducer<Gram, Gram, Gram, Gram> {
+  Reducer<Gram, Gram, Gram, Gram> {
 
-  /** collocation finder: pass 1 collec phase:
+  /** collocation finder: pass 1 colloc phase:
    *  
    *  given input from the mapper,
    *  k:h_subgram:1 v:ngram:1
@@ -49,34 +50,34 @@
    *  and move the count into the value?
    */
   @Override
-  public void reduce(Gram key, Iterator<Gram> value,
+  public void reduce(Gram subgramKey, Iterator<Gram> ngramValues,
       OutputCollector<Gram, Gram> output, Reporter reporter) throws IOException {
 
-    HashMap<Gram,Gram> set = new HashMap<Gram,Gram>();
+    HashMap<Gram,Gram> ngramSet = new HashMap<Gram,Gram>();
     int subgramFrequency = 0;
 
-    while (value.hasNext()) {
-      Gram t = value.next();
-      subgramFrequency += t.getFrequency();
+    while (ngramValues.hasNext()) {
+      Gram ngram = ngramValues.next();
+      subgramFrequency += ngram.getFrequency();
 
-      Gram s = set.get(t);
-      if (s == null) {
+      Gram ngramCanon = ngramSet.get(ngram);
+      if (ngramCanon == null) {
-        // t is potentially reused, so create a new object to populate the HashMap
+        // the incoming ngram is potentially reused, so create a new object to populate the HashMap
-        Gram e = new Gram(t);
-        set.put(e,e);
+        Gram ngramEntry = new Gram(ngram);
+        ngramSet.put(ngramEntry,ngramEntry);
       }
       else {
-        s.incrementFrequency(t.getFrequency());
+        ngramCanon.incrementFrequency(ngram.getFrequency());
       }
     }
 
     // emit subgram:subgramFreq ngram:ngramFreq pairs
-    key.setFrequency(subgramFrequency);
+    subgramKey.setFrequency(subgramFrequency);
 
-    for (Gram t: set.keySet()) {
-      if(key.getType() == Type.UNIGRAM)
-        t.setType(key.getType());
-      output.collect(key, t);
+    for (Gram ngram: ngramSet.keySet()) {
+      if(subgramKey.getType() == Type.UNIGRAM)
+        ngram.setType(subgramKey.getType());
+      output.collect(subgramKey, ngram);
     }
   }
 

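The renames above also make the combiner's central trick easier to read: the HashMap<Gram,Gram> acts as a set that hands back the canonical instance for each distinct ngram, since Hadoop may reuse the object returned by the values iterator. A fresh copy is stored on first sight, and the stored copy's frequency is incremented thereafter. Below is the same pattern in miniature, with a hypothetical TermCount standing in for Gram; it assumes, as the combiner's lookup implies, that equality and hashCode ignore the mutable frequency field.

    import java.util.HashMap;

    public class CanonicalInstanceSketch {

      // toy stand-in for Gram: equality and hashing consider only the term,
      // so an entry can be looked up and then have its frequency mutated
      static class TermCount {
        final String term;
        int frequency;
        TermCount(String term, int frequency) { this.term = term; this.frequency = frequency; }
        TermCount(TermCount other) { this(other.term, other.frequency); }
        @Override public boolean equals(Object o) {
          return o instanceof TermCount && ((TermCount) o).term.equals(term);
        }
        @Override public int hashCode() { return term.hashCode(); }
      }

      public static void main(String[] args) {
        HashMap<TermCount,TermCount> set = new HashMap<TermCount,TermCount>();
        for (String t : new String[] {"of times", "of times", "worst of"}) {
          TermCount incoming = new TermCount(t, 1); // Hadoop would hand back one reused instance
          TermCount canonical = set.get(incoming);
          if (canonical == null) {
            TermCount entry = new TermCount(incoming); // defensive copy, as in the combiner
            set.put(entry, entry);
          } else {
            canonical.frequency += incoming.frequency;
          }
        }
        for (TermCount t : set.keySet()) {
          System.out.println(t.term + ':' + t.frequency); // of times:2, worst of:1
        }
      }
    }
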
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Thu Feb 11 06:01:19 2010
@@ -49,10 +49,13 @@
 
 /** Driver for LLR collocation discovery mapreduce job */
 public class CollocDriver {
+  public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
+  public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams";
+  public static final String NGRAM_OUTPUT_DIRECTORY   = "ngrams";
+  
   public static final String EMIT_UNIGRAMS = "emit-unigrams";
   public static final boolean DEFAULT_EMIT_UNIGRAMS = false;
   
-  public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
   public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
   public static final int DEFAULT_PASS1_NUM_REDUCE_TASKS = 1;
   
@@ -78,18 +81,17 @@
        .withDescription("The Path to write output to").withShortName("o")
         .create();
     
-    Option maxNGramSizeOpt = obuilder
-        .withLongName("maxNGramSize")
-        .withRequired(false)
-        .withArgument(
+    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize")
+        .withRequired(false).withArgument(
           abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) The maximum size of ngrams to create"
               + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
         .withShortName("ng").create();
     
-    Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
-      abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
+    Option minSupportOpt = obuilder.withLongName("minSupport")
+        .withRequired(false).withArgument(
+          abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) Minimum Support. Default Value: "
               + CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s").create();
@@ -102,14 +104,14 @@
               + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
     
     Option numReduceTasksOpt = obuilder.withLongName("numReducers")
-        .withArgument(
+        .withRequired(false).withArgument(
           abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
               .create()).withDescription(
           "(Optional) Number of reduce tasks. Default Value: "
               + DEFAULT_PASS1_NUM_REDUCE_TASKS).withShortName("nr").create();
     
-    Option preprocessOpt = obuilder.withLongName("preprocess").withRequired(
-      false).withDescription(
+    Option preprocessOpt = obuilder.withLongName("preprocess")
+        .withRequired(false).withDescription(
       "If set, input is SequenceFile<Text,Text> where the value is the document, "
           + " which will be tokenized using the specified analyzer.")
         .withShortName("p").create();
@@ -188,7 +190,7 @@
         reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
             .toString());
       }
-      log.info("Pass1 reduce tasks: {}", reduceTasks);
+      log.info("Number of pass1 reduce tasks: {}", reduceTasks);
       
       boolean emitUnigrams = cmdLine.hasOption(unigramOpt);
       
@@ -204,7 +206,9 @@
           analyzerClass.newInstance();
         }
         
-        String tokenizedPath = output + "/tokenized-documents";
+        String tokenizedPath = 
+          output + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+        
         DocumentProcessor
             .tokenizeDocuments(input, analyzerClass, tokenizedPath);
         input = tokenizedPath;
@@ -280,7 +284,7 @@
     conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, emitUnigrams);
     
     FileInputFormat.setInputPaths(conf, new Path(input));
-    Path outPath = new Path(output + "/subgrams");
+    Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
     FileOutputFormat.setOutputPath(conf, outPath);
     
     conf.setInputFormat(SequenceFileInputFormat.class);
@@ -316,8 +320,8 @@
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(DoubleWritable.class);
     
-    FileInputFormat.setInputPaths(conf, new Path(output + "/subgrams"));
-    Path outPath = new Path(output + "/ngrams");
+    FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
+    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
     FileOutputFormat.setOutputPath(conf, outPath);
     
     conf.setMapperClass(IdentityMapper.class);

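Note that the new constants carry no leading slash: the two-argument Path constructor inserts the separator itself. (DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER and TFIDFConverter.TFIDF_OUTPUT_FOLDER below keep the slash because their callers still build paths by string concatenation.) A quick sketch of the equivalence; the output directory here is illustrative.

    import org.apache.hadoop.fs.Path;

    public class PathJoinSketch {
      public static void main(String[] args) {
        String output = "/user/mahout/colloc";
        Path concatenated = new Path(output + "/subgrams"); // old spelling
        Path joined = new Path(output, "subgrams");         // new: (output, SUBGRAM_OUTPUT_DIRECTORY)
        System.out.println(joined);                         // /user/mahout/colloc/subgrams
        System.out.println(concatenated.equals(joined));    // true
      }
    }
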
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducer.java Thu Feb 11 06:01:19 2010
@@ -84,40 +84,40 @@
    * move the count into the value?
    */
   @Override
-  public void reduce(Gram key,
-                     Iterator<Gram> value,
+  public void reduce(Gram subgramKey,
+                     Iterator<Gram> ngramValues,
                      OutputCollector<Gram,Gram> output,
                      Reporter reporter) throws IOException {
     
-    HashMap<Gram,Gram> set = new HashMap<Gram,Gram>();
+    HashMap<Gram,Gram> ngramSet = new HashMap<Gram,Gram>();
     int subgramFrequency = 0;
     
-    while (value.hasNext()) {
-      Gram t = value.next();
-      subgramFrequency += t.getFrequency();
+    while (ngramValues.hasNext()) {
+      Gram ngram = ngramValues.next();
+      subgramFrequency += ngram.getFrequency();
       
-      Gram s = set.get(t);
-      if (s == null) {
+      Gram ngramCanon = ngramSet.get(ngram);
+      if (ngramCanon == null) {
-        // t is potentially reused, so create a new object to populate the
-        // HashMap
+        // the incoming ngram is potentially reused, so create a new object
+        // to populate the HashMap
-        Gram e = new Gram(t);
-        set.put(e, e);
+        Gram ngramEntry = new Gram(ngram);
+        ngramSet.put(ngramEntry, ngramEntry);
       } else {
-        s.incrementFrequency(t.getFrequency());
+        ngramCanon.incrementFrequency(ngram.getFrequency());
       }
     }
     
     // emit ngram:ngramFreq, subgram:subgramFreq pairs.
-    key.setFrequency(subgramFrequency);
+    subgramKey.setFrequency(subgramFrequency);
     
-    for (Gram t : set.keySet()) {
-      if (t.getFrequency() < minSupport) {
+    for (Gram ngram : ngramSet.keySet()) {
+      if (ngram.getFrequency() < minSupport) {
         reporter.incrCounter(Skipped.LESS_THAN_MIN_SUPPORT, 1);
         continue;
       }
-      if(key.getType() == Type.UNIGRAM)
-        t.setType(key.getType());
-      output.collect(t, key);
+      if(subgramKey.getType() == Type.UNIGRAM)
+        ngram.setType(subgramKey.getType());
+      output.collect(ngram, subgramKey);
     }
   }
 }

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java?rev=908851&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollector.java Thu Feb 11 06:01:19 2010
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.TAIL;
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs tokenization, ngram generation and collection for the first pass
+ * of the LLR collocation discovery job. This code is factored out of the
+ * mappers so that different input formats can be supported.
+ * 
+ * @see org.apache.mahout.utils.nlp.collocations.llr.colloc.CollocMapperTextFile
+ */
+public class NGramCollector {
+  
+  public static final String ANALYZER_CLASS = "analyzerClass";
+  public static final String MAX_SHINGLE_SIZE = "maxShingleSize";
+  
+  public static enum Count {
+    NGRAM_TOTAL;
+  }
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(NGramCollector.class);
+  
+  /**
+   * An analyzer to perform tokenization. A ShingleFilter will be wrapped around
+   * its output to create ngrams
+   */
+  private Analyzer a;
+  
+  /** max size of shingles (ngrams) to create */
+  private int maxShingleSize;
+  
+  public NGramCollector() {}
+  
+  /**
+   * Configure the NGramCollector.
+   * 
+   * Reads NGramCollector.ANALYZER_CLASS and instantiates that class if it is
+   * provided. Otherwise a lucene StandardAnalyzer is used, set to be
+   * compatible with LUCENE_24.
+   * 
+   * Reads NGramCollector.MAX_SHINGLE_SIZE and uses this as the parameter to the
+   * ShingleFilter.
+   * 
+   * @param job the job configuration, which supplies the settings above
+   */
+  public void configure(JobConf job) {
+    this.a = null;
+    try {
+      ClassLoader ccl = Thread.currentThread().getContextClassLoader();
+      String analyzerClass = job.get(NGramCollector.ANALYZER_CLASS);
+      if (analyzerClass != null) {
+        Class<?> cl = ccl.loadClass(analyzerClass);
+        a = (Analyzer) cl.newInstance();
+      }
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    } catch (InstantiationException e) {
+      throw new IllegalStateException(e);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(e);
+    }
+    
+    if (this.a == null) {
+      // No analyzer specified. Use the LUCENE_24 analyzer here because
+      // it does not preserve stop word positions.
+      this.a = new StandardAnalyzer(Version.LUCENE_24);
+    }
+    
+    this.maxShingleSize = job.getInt(NGramCollector.MAX_SHINGLE_SIZE, 2);
+    
+    if (log.isInfoEnabled()) {
+      log.info("Analyzer is {}", this.a.toString());
+      log.info("Max Ngram size is {}", this.maxShingleSize);
+    }
+  }
+  
+  /**
+   * Receives a document and uses a lucene analyzer to tokenize it. The
+   * ShingleFilter delivers ngrams of the appropriate size, which are then
+   * decomposed into head and tail subgrams and collected in the following
+   * manner:
+   * 
+   * k:h_subgram v:ngram k:t_subgram v:ngram
+   * 
+   * The 'h_' or 't_' prefix specifies whether the subgram in question is the
+   * head or tail of the ngram. In this implementation the head of the ngram
+   * is an (n-1)gram, and the tail is a unigram.
+   * 
+   * For example, given 'click and clack' and an ngram length of 3: k:'h_click
+   * and' v:'click and clack' k:'t_clack' v:'click and clack'
+   * 
+   * Also counts the total number of ngrams encountered and adds it to the
+   * counter NGramCollector.Count.NGRAM_TOTAL
+   * 
+   * @param r
+   *          The reader to read input from -- used to create a tokenstream from
+   *          the analyzer
+   * 
+   * @param collector
+   *          The collector to send output to
+   * 
+   * @param reporter
+   *          Used to deliver the final ngram-count.
+   * 
+   * @throws IOException
+   *           if there's a problem with the ShingleFilter reading data or the
+   *           collector collecting output.
+   */
+  public void collectNgrams(Reader r,
+                            OutputCollector<Gram,Gram> collector,
+                            Reporter reporter) throws IOException {
+    TokenStream st = a.tokenStream("text", r);
+    ShingleFilter sf = new ShingleFilter(st, maxShingleSize);
+    
+    sf.reset();
+    int count = 0; // ngram count
+    
+    do {
+      String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
+          .term();
+      String type = ((TypeAttribute) sf.getAttribute(TypeAttribute.class))
+          .type();
+      
+      if ("shingle".equals(type)) {
+        count++;
+        Gram ngram = new Gram(term);
+        
+        // obtain components, the leading (n-1)gram and the trailing unigram.
+        int i = term.lastIndexOf(' ');
+        if (i != -1) {
+          collector.collect(new Gram(term.substring(0, i), HEAD), ngram);
+          collector.collect(new Gram(term.substring(i + 1), TAIL), ngram);
+        }
+      }
+    } while (sf.incrementToken());
+    
+    reporter.incrCounter(NGRAM_TOTAL, count);
+    
+    sf.end();
+    sf.close();
+    r.close();
+  }
+}

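To make the head/tail decomposition concrete, here is a minimal standalone sketch of the same splitting logic against the Lucene 2.9-era ShingleFilter. It substitutes a WhitespaceAnalyzer so stop-word removal doesn't introduce filler tokens, and replaces the 'shingle' type check with a simple space test, which is equivalent for whitespace-tokenized input.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.shingle.ShingleFilter;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ShingleDecompositionSketch {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new WhitespaceAnalyzer();
        TokenStream ts = analyzer.tokenStream("text", new StringReader("click and clack"));
        ShingleFilter sf = new ShingleFilter(ts, 3); // shingles up to trigrams
        TermAttribute termAtt = (TermAttribute) sf.addAttribute(TermAttribute.class);
        while (sf.incrementToken()) {
          String term = termAtt.term();
          int i = term.lastIndexOf(' ');
          if (i != -1) { // multi-token shingle: leading (n-1)gram head, trailing unigram tail
            System.out.println("k:h_" + term.substring(0, i) + " v:" + term);
            System.out.println("k:t_" + term.substring(i + 1) + " v:" + term);
          }
        }
        sf.close();
      }
    }
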
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Thu Feb 11 06:01:19 2010
@@ -106,7 +106,7 @@
    *          the minimum frequency of the feature in the entire corpus to be
    *          considered for inclusion in the sparse vector
    * @param maxNGramSize
-   *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigrama and
+   *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and
    *          trigram
    * @param minLLRValue
    *          minValue of log likelihood ratio to used to prune ngrams
@@ -146,7 +146,7 @@
       CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath
           .toString(), maxNGramSize, minSupport, minLLRValue, numReducers);
       dictionaryChunks = createDictionaryChunks(minSupport, new Path(
-          output + DICTIONARY_JOB_FOLDER + "/ngrams"), output,
+          output + DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
         chunkSizeInMegabytes, new DoubleWritable());
     }
     

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java Thu Feb 11 06:01:19 2010
@@ -46,6 +46,7 @@
  */
 public final class DocumentProcessor {
   
+  public static final String TOKENIZED_DOCUMENT_OUTPUT_FOLDER = "/tokenized-documents";
   public static final String ANALYZER_CLASS = "analyzer.class";
   
   public static final Charset CHARSET = Charset.forName("UTF-8");

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=908851&r1=908850&r2=908851&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Thu Feb 11 06:01:19 2010
@@ -65,6 +65,8 @@
   
   public static final String MAX_DF_PERCENTAGE = "max.df.percentage";
   
+  public static final String TFIDF_OUTPUT_FOLDER = "/tfidf";
+  
   private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "/vectors";
   
   private static final String FREQUENCY_FILE = "/frequency.file-";

Added: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java?rev=908851&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java (added)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/NGramCollectorTest.java Thu Feb 11 06:01:19 2010
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.HEAD;
+import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.TAIL;
+import static org.apache.mahout.utils.nlp.collocations.llr.NGramCollector.Count.NGRAM_TOTAL;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Collections;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.utils.nlp.collocations.llr.Gram.Type;
+import org.easymock.EasyMock;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Tests for NGramCollector.
+ * FIXME: Add negative test cases
+ */
+@SuppressWarnings("deprecation")
+public class NGramCollectorTest {
+
+  OutputCollector<Gram,Gram> collector;
+  Reporter reporter;
+
+  @Before
+  @SuppressWarnings("unchecked")
+  public void setUp() {
+    collector = EasyMock.createMock(OutputCollector.class);
+    reporter  = EasyMock.createMock(Reporter.class);
+  }
+
+  @Test
+  public void testCollectNgrams() throws Exception {
+
+    String input = "the best of times the worst of times";
+
+    String[][] values = 
+      new String[][]{
+        {"h_the",   "the best"},
+        {"t_best",  "the best"},
+        {"h_best",  "best of"},
+        {"t_of",    "best of"},
+        {"h_of",    "of times"},
+        {"t_times", "of times"},
+        {"h_times", "times the"},
+        {"t_the",   "times the"},
+        {"h_the",   "the worst"},
+        {"t_worst", "the worst"},
+        {"h_worst", "worst of"},
+        {"t_of",    "worst of"},
+        {"h_of",    "of times"},
+        {"t_times", "of times"}
+    };
+
+    // set up expectations for the mocks; ngram max size = 2
+    for (String[] v: values) {
+      Type p = v[0].startsWith("h") ? HEAD : TAIL;
+      Gram subgram = new Gram(v[0].substring(2), p);
+      Gram ngram = new Gram(v[1]);
+      collector.collect(subgram, ngram);
+    }
+
+    reporter.incrCounter(NGRAM_TOTAL, 7);
+    EasyMock.replay(reporter, collector);
+    
+    Reader r = new StringReader(input);
+
+    JobConf conf = new JobConf();
+    conf.set(NGramCollector.MAX_SHINGLE_SIZE, "2");
+    conf.set(NGramCollector.ANALYZER_CLASS, TestAnalyzer.class.getName());
+
+    NGramCollector c = new NGramCollector();
+    c.configure(conf);
+    
+    c.collectNgrams(r, collector, reporter);
+
+    EasyMock.verify(reporter, collector);
+  }
+
+  /** A lucene 2.9 standard analyzer with no stopwords. */
+  public static class TestAnalyzer extends Analyzer {
+    final Analyzer a;
+    
+    public TestAnalyzer() {
+      a = new StandardAnalyzer(Version.LUCENE_29, Collections.EMPTY_SET);
+    }
+    
+    @Override
+    public TokenStream tokenStream(String arg0, Reader arg1) {
+      return a.tokenStream(arg0, arg1);
+    }
+  }
+}
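
One point worth spelling out about the test above: the collector.collect(subgram, ngram) and reporter.incrCounter(NGRAM_TOTAL, 7) calls made before EasyMock.replay() execute nothing; they record expectations, and EasyMock.verify() then checks that collectNgrams reproduced exactly those calls. The record/replay/verify cycle in miniature, mocking java.util.List purely for illustration:

    import java.util.List;

    import org.easymock.EasyMock;

    public class EasyMockLifecycleSketch {
      @SuppressWarnings("unchecked")
      public static void main(String[] args) {
        List<String> mock = EasyMock.createMock(List.class);

        // record phase: these calls register expectations, nothing runs yet
        EasyMock.expect(mock.add("of times")).andReturn(true);
        mock.clear(); // void methods are recorded by simply invoking them

        EasyMock.replay(mock); // switch from recording to replaying

        mock.add("of times");  // the code under test would make these calls
        mock.clear();

        EasyMock.verify(mock); // fails if an expected call did not happen
      }
    }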