Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/11 08:12:50 UTC

svn commit: r908859 - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/nlp/collocations/llr/ test/java/org/apache/mahout/utils/nlp/collocations/llr/

Author: robinanil
Date: Thu Feb 11 07:12:48 2010
New Revision: 908859

URL: http://svn.apache.org/viewvc?rev=908859&view=rev
Log:
MAHOUT-285 CollocMapper optimisations (Now reduces number of subgrams in output)
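In practical terms, the mapper previously emitted one (subgram, ngram) pair per n-gram occurrence in a document; with this change it first counts occurrences per document in memory and emits each distinct n-gram once, carrying its frequency on the Gram. A minimal, self-contained sketch of that counting step, using java.util.HashMap in place of Mahout's OpenObjectIntHashMap (class and variable names are illustrative, not the committed code):

    import java.util.HashMap;
    import java.util.Map;

    // Rough sketch of per-document n-gram counting; not CollocMapper itself.
    public class NgramCountSketch {
      public static void main(String[] args) {
        // Bigrams produced for "the best of times the worst of times", max shingle size 2.
        String[] shingles = {"the best", "best of", "of times", "times the",
                             "the worst", "worst of", "of times"};
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String s : shingles) {
          Integer prev = counts.get(s);
          counts.put(s, prev == null ? 1 : prev + 1); // aggregate within the document
        }
        // "of times" -> 2, every other bigram -> 1: six distinct n-grams are emitted
        // (twelve HEAD/TAIL subgrams) instead of seven occurrences (fourteen subgrams).
        System.out.println(counts);
      }
    }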

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=908859&r1=908858&r2=908859&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Thu Feb 11 07:12:48 2010
@@ -150,118 +150,119 @@
         .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
           maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
         .create();
-    
-    Parser parser = new Parser();
-    parser.setGroup(group);
-    CommandLine cmdLine = null;
-    
     try {
-      // standard help opt won't work because
-      // outputDir is required and exception will 
-      // be thrown if it is not present.
-      cmdLine = parser.parse(args);
-    }
-    catch (OptionException oe) {
-        System.out.println(oe.getMessage());
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      
+      if (cmdLine.hasOption(helpOpt)) {
         CommandLineUtil.printHelp(group);
         return;
-    }
-    
-    String inputDir = (String) cmdLine.getValue(inputDirOpt);
-    String outputDir = (String) cmdLine.getValue(outputDirOpt);
-    
-    int chunkSize = 100;
-    if (cmdLine.hasOption(chunkSizeOpt)) {
-      chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
-    }
-    int minSupport = 2;
-    if (cmdLine.hasOption(minSupportOpt)) {
-      String minSupportString = (String) cmdLine.getValue(minSupportOpt);
-      minSupport = Integer.parseInt(minSupportString);
-    }
-    
-    int maxNGramSize = 1;
-    
-    if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
-      try {
-        maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+      }
+      
+      String inputDir = (String) cmdLine.getValue(inputDirOpt);
+      String outputDir = (String) cmdLine.getValue(outputDirOpt);
+      
+      int chunkSize = 100;
+      if (cmdLine.hasOption(chunkSizeOpt)) {
+        chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
+      }
+      int minSupport = 2;
+      if (cmdLine.hasOption(minSupportOpt)) {
+        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
+        minSupport = Integer.parseInt(minSupportString);
+      }
+      
+      int maxNGramSize = 1;
+      
+      if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
+        try {
+          maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+              .toString());
+        } catch (NumberFormatException ex) {
+          log.warn("Could not parse ngram size option");
+        }
+      }
+      log.info("Maximum n-gram size is: {}", maxNGramSize);
+      
+      if (cmdLine.hasOption(overwriteOutput) == true) {
+        HadoopUtil.overwriteOutput(outputDir);
+      }
+      
+      float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+      if (cmdLine.hasOption(minLLROpt)) {
+        minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
+      }
+      log.info("Minimum LLR value: {}", minLLRValue);
+      
+      int reduceTasks = 1;
+      if (cmdLine.hasOption(numReduceTasksOpt)) {
+        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
             .toString());
-      } catch (NumberFormatException ex) {
-        log.warn("Could not parse ngram size option");
       }
-    }
-    log.info("Maximum n-gram size is: {}", maxNGramSize);
-    
-    if (cmdLine.hasOption(overwriteOutput) == true) {
-      HadoopUtil.overwriteOutput(outputDir);
-    }
-    
-    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
-    if (cmdLine.hasOption(minLLROpt)) {
-      minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
-    }
-    log.info("Minimum LLR value: {}", minLLRValue);
-    
-    int reduceTasks = 1;
-    if (cmdLine.hasOption(numReduceTasksOpt)) {
-      reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
-          .toString());
-    }
-    log.info("Pass1 reduce tasks: {}", reduceTasks);
-    
-    Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
-    if (cmdLine.hasOption(analyzerNameOpt)) {
-      String className = cmdLine.getValue(analyzerNameOpt).toString();
-      analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
-      // try instantiating it, b/c there isn't any point in setting it if
-      // you can't instantiate it
-      analyzerClass.newInstance();
-    }
-    
-    boolean processIdf;
-    
-    if (cmdLine.hasOption(weightOpt)) {
-      String wString = cmdLine.getValue(weightOpt).toString();
-      if (wString.equalsIgnoreCase("tf")) {
-        processIdf = false;
-      } else if (wString.equalsIgnoreCase("tfidf")) {
-        processIdf = true;
-      } else {
-        throw new OptionException(weightOpt);
+      log.info("Pass1 reduce tasks: {}", reduceTasks);
+      
+      Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+      if (cmdLine.hasOption(analyzerNameOpt)) {
+        String className = cmdLine.getValue(analyzerNameOpt).toString();
+        analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
+        // try instantiating it, b/c there isn't any point in setting it if
+        // you can't instantiate it
+        analyzerClass.newInstance();
       }
-    } else {
-      processIdf = true;
-    }
-    
-    int minDf = 1;
-    if (cmdLine.hasOption(minDFOpt)) {
-      minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
-    }
-    int maxDFPercent = 99;
-    if (cmdLine.hasOption(maxDFPercentOpt)) {
-      maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt)
-          .toString());
-    }
-    
-    float norm = PartialVectorMerger.NO_NORMALIZING;
-    if (cmdLine.hasOption(powerOpt)) {
-      String power = cmdLine.getValue(powerOpt).toString();
-      if (power.equals("INF")) {
-        norm = Float.POSITIVE_INFINITY;
+      
+      boolean processIdf;
+      
+      if (cmdLine.hasOption(weightOpt)) {
+        String wString = cmdLine.getValue(weightOpt).toString();
+        if (wString.equalsIgnoreCase("tf")) {
+          processIdf = false;
+        } else if (wString.equalsIgnoreCase("tfidf")) {
+          processIdf = true;
+        } else {
+          throw new OptionException(weightOpt);
+        }
       } else {
-        norm = Float.parseFloat(power);
+        processIdf = true;
       }
-    }
-    HadoopUtil.overwriteOutput(outputDir);
-    String tokenizedPath = outputDir + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
-    DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
-    
-    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
-      minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
-    if (processIdf) {
-      TFIDFConverter.processTfIdf(
-        outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
-        outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf, maxDFPercent, norm);
+      
+      int minDf = 1;
+      if (cmdLine.hasOption(minDFOpt)) {
+        minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
+      }
+      int maxDFPercent = 99;
+      if (cmdLine.hasOption(maxDFPercentOpt)) {
+        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt)
+            .toString());
+      }
+      
+      float norm = PartialVectorMerger.NO_NORMALIZING;
+      if (cmdLine.hasOption(powerOpt)) {
+        String power = cmdLine.getValue(powerOpt).toString();
+        if (power.equals("INF")) {
+          norm = Float.POSITIVE_INFINITY;
+        } else {
+          norm = Float.parseFloat(power);
+        }
+      }
+      HadoopUtil.overwriteOutput(outputDir);
+      String tokenizedPath = outputDir
+                             + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+      DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
+        tokenizedPath);
+      
+      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
+        minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
+      if (processIdf) {
+        TFIDFConverter.processTfIdf(
+          outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
+          outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf,
+          maxDFPercent, norm);
+      }
+    } catch (OptionException e) {
+      log.error("Exception", e);
+      CommandLineUtil.printHelp(group);
     }
   }
+  
 }
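The SparseVectorsFromSequenceFiles change is mostly a re-indentation: the whole option-handling block, including the --weight check that throws OptionException, now sits inside a single try, so a parse failure or a bad weight value is logged and answered with the usage text instead of propagating out of main. A rough, self-contained sketch of that control-flow shape (the hand-rolled parsing and the OptionException stand-in below are illustrative only; the real job uses the commons-cli2 Parser, Group and CommandLineUtil shown in the diff):

    import java.util.Arrays;
    import java.util.List;

    public class OptionFlowSketch {

      // Stand-in for the commons-cli2 OptionException used by the job.
      static class OptionException extends Exception {
        OptionException(String message) { super(message); }
      }

      static void printHelp() {
        System.out.println("usage: ... [--help] [--weight tf|tfidf]");
      }

      public static void main(String[] args) {
        try {
          List<String> cmdLine = Arrays.asList(args);  // stands in for parser.parse(args)
          if (cmdLine.contains("--help")) {
            printHelp();
            return;
          }
          boolean processIdf = true;
          int i = cmdLine.indexOf("--weight");
          if (i >= 0 && i + 1 < cmdLine.size()) {
            String w = cmdLine.get(i + 1);
            if (w.equalsIgnoreCase("tf")) {
              processIdf = false;
            } else if (!w.equalsIgnoreCase("tfidf")) {
              // mirrors "throw new OptionException(weightOpt)" in the diff
              throw new OptionException("unknown weight: " + w);
            }
          }
          System.out.println("would run the vectorizer, processIdf=" + processIdf);
        } catch (OptionException e) {
          // one catch covers parse failures and the weight check, then prints usage
          System.err.println(e.getMessage());
          printHelp();
        }
      }
    }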

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=908859&r1=908858&r2=908859&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java Thu Feb 11 07:12:48 2010
@@ -35,6 +35,8 @@
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.function.ObjectIntProcedure;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -115,12 +117,16 @@
   @Override
   public void map(Text key,
                   StringTuple value,
-                  OutputCollector<Gram,Gram> collector,
+                  final OutputCollector<Gram,Gram> collector,
                   Reporter reporter) throws IOException {
     
     ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
         .getEntries().iterator()), maxShingleSize);
     int count = 0; // ngram count
+    OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
+        value.getEntries().size() * (maxShingleSize - 1));
+    OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(
+        value.getEntries().size());
     
     do {
       String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
@@ -129,19 +135,54 @@
           .type();
       if ("shingle".equals(type)) {
         count++;
-        Gram ngram = new Gram(term);
+        if (ngrams.containsKey(term) == false) {
+          ngrams.put(term, 1);
+        } else {
+          ngrams.put(term, 1 + ngrams.get(term));
+        }
+      } else if (emitUnigrams && term.length() > 0) { // unigram
+        if (unigrams.containsKey(term) == false) {
+          unigrams.put(term, 1);
+        } else {
+          unigrams.put(term, 1 + unigrams.get(term));
+        }
+      }
+    } while (sf.incrementToken());
+    
+    ngrams.forEachPair(new ObjectIntProcedure<String>() {
+      
+      @Override
+      public boolean apply(String term, int frequency) {
+        Gram ngram = new Gram(term, frequency);
         // obtain components, the leading (n-1)gram and the trailing unigram.
         int i = term.lastIndexOf(' ');
         if (i != -1) { // bigram, trigram etc
-          collector.collect(new Gram(term.substring(0, i), HEAD), ngram);
-          collector.collect(new Gram(term.substring(i + 1), TAIL), ngram);
+          try {
+            collector.collect(new Gram(term.substring(0, i), frequency, HEAD),
+              ngram);
+            collector.collect(new Gram(term.substring(i + 1), frequency, TAIL),
+              ngram);
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
         }
-      } else if (emitUnigrams && term.length() > 0) { // unigram
-        Gram ngram = new Gram(term);
-        Gram unigram = new Gram(term, UNIGRAM);
-        collector.collect(unigram, ngram);
+        return true;
       }
-    } while (sf.incrementToken());
+    });
+    
+    unigrams.forEachPair(new ObjectIntProcedure<String>() {
+      @Override
+      public boolean apply(String term, int frequency) {
+        try {
+          Gram ngram = new Gram(term, frequency);
+          Gram unigram = new Gram(term, frequency, UNIGRAM);
+          collector.collect(unigram, ngram);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+        return true;
+      }
+    });
     
     reporter.incrCounter(Count.NGRAM_TOTAL, count);
     

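After the counting pass, the mapper walks each map once with forEachPair and emits frequency-carrying Grams from inside the ObjectIntProcedure callback; because apply() cannot declare IOException, the checked exception from collect() is rethrown as a RuntimeException. A small, self-contained sketch of that emission pattern, assuming mahout-math's OpenObjectIntHashMap and ObjectIntProcedure are on the classpath (the printed "emit" stands in for the OutputCollector calls):

    import org.apache.mahout.math.function.ObjectIntProcedure;
    import org.apache.mahout.math.map.OpenObjectIntHashMap;

    public class ForEachPairSketch {
      public static void main(String[] args) {
        // Count a few bigrams for one document, then walk the map once to emit them.
        OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>();
        for (String term : new String[] {"of times", "the best", "of times"}) {
          if (ngrams.containsKey(term)) {
            ngrams.put(term, 1 + ngrams.get(term));
          } else {
            ngrams.put(term, 1);
          }
        }
        ngrams.forEachPair(new ObjectIntProcedure<String>() {
          @Override
          public boolean apply(String term, int frequency) {
            // In CollocMapper this is where the HEAD/TAIL subgrams and the n-gram
            // itself are collected; collect()'s checked IOException has to be
            // wrapped in a RuntimeException because apply() cannot throw it.
            int i = term.lastIndexOf(' ');
            System.out.println("emit head=" + term.substring(0, i)
                               + " tail=" + term.substring(i + 1)
                               + " frequency=" + frequency);
            return true; // keep iterating over the map
          }
        });
      }
    }

The test changes below follow from this: each distinct bigram now arrives once, so the expected (subgram, ngram) pairs carry a frequency of 2 for "of times" (and for the repeated unigrams) and 1 otherwise, while NGRAM_TOTAL still counts all 7 occurrences.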
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java?rev=908859&r1=908858&r2=908859&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java Thu Feb 11 07:12:48 2010
@@ -38,138 +38,126 @@
 import org.junit.Before;
 import org.junit.Test;
 
-/** Test for CollocMapper
- * FIXME: Add negative test cases
+/**
+ * Test for CollocMapper FIXME: Add negative test cases
  */
 @SuppressWarnings("deprecation")
 public class CollocMapperTest {
-
+  
   OutputCollector<Gram,Gram> collector;
   Reporter reporter;
-
+  
   @Before
   @SuppressWarnings("unchecked")
   public void setUp() {
     collector = EasyMock.createMock(OutputCollector.class);
-    reporter  = EasyMock.createMock(Reporter.class);
+    reporter = EasyMock.createMock(Reporter.class);
   }
-
+  
   @Test
   public void testCollectNgrams() throws Exception {
-
+    
     Text key = new Text();
     key.set("dummy-key");
     
-    String[] input = {"the", "best", "of", "times", "the", "worst", "of", "times"};
+    String[] input = {"the", "best", "of", "times", "the", "worst", "of",
+                      "times"};
     StringTuple inputTuple = new StringTuple();
-    for (String i: input) {
+    for (String i : input) {
       inputTuple.add(i);
     }
-
-    String[][] values = 
-      new String[][]{
-        {"h_the",   "the best"},
-        {"t_best",  "the best"},
-        {"h_best",  "best of"},
-        {"t_of",    "best of"},
-        {"h_of",    "of times"},
-        {"t_times", "of times"},
-        {"h_times", "times the"},
-        {"t_the",   "times the"},
-        {"h_the",   "the worst"},
-        {"t_worst", "the worst"},
-        {"h_worst", "worst of"},
-        {"t_of",    "worst of"},
-        {"h_of",    "of times"},
-        {"t_times", "of times"}
-    };
+    
+    String[][] values = new String[][] { {"h_the", "the best"},
+                                        {"t_best", "the best"},
+                                        {"h_of", "of times"},
+                                        {"t_times", "of times"},
+                                        {"h_best", "best of"},
+                                        {"t_of", "best of"},
+                                        {"h_the", "the worst"},
+                                        {"t_worst", "the worst"},
+                                        {"h_times", "times the"},
+                                        {"t_the", "times the"},
+                                        {"h_worst", "worst of"},
+                                        {"t_of", "worst of"},};
     // set up expectations for mocks. ngram max size = 2
-    for (String[] v: values) {
+    for (String[] v : values) {
       Type p = v[0].startsWith("h") ? HEAD : TAIL;
-      Gram subgram = new Gram(v[0].substring(2), p);
-      Gram ngram = new Gram(v[1]);
+      int frequency = 1;
+      if (v[1].equals("of times")) frequency = 2;
+      Gram subgram = new Gram(v[0].substring(2), frequency, p);
+      Gram ngram = new Gram(v[1], frequency);
       collector.collect(subgram, ngram);
     }
-   
-
+    
     reporter.incrCounter(CollocMapper.Count.NGRAM_TOTAL, 7);
     EasyMock.replay(reporter, collector);
     
-
     JobConf conf = new JobConf();
     conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
-
+    
     CollocMapper c = new CollocMapper();
     c.configure(conf);
     
     c.map(key, inputTuple, collector, reporter);
-
+    
     EasyMock.verify(reporter, collector);
   }
   
   @Test
   public void testCollectNgramsWithUnigrams() throws Exception {
-
+    
     Text key = new Text();
     key.set("dummy-key");
     
-    String[] input = {"the", "best", "of", "times", "the", "worst", "of", "times"};
+    String[] input = {"the", "best", "of", "times", "the", "worst", "of",
+                      "times"};
     StringTuple inputTuple = new StringTuple();
-    for (String i: input) {
+    for (String i : input) {
       inputTuple.add(i);
     }
-
-    String[][] values = 
-      new String[][]{
-        {"u_the", "the"},
-        {"h_the",   "the best"},
-        {"t_best",  "the best"},
-        {"u_best", "best"},
-        {"h_best",  "best of"},
-        {"t_of",    "best of"},
-        {"u_of", "of"},
-        {"h_of",    "of times"},
-        {"t_times", "of times"},
-        {"u_times", "times"},
-        {"h_times", "times the"},
-        {"t_the",   "times the"},
-        {"u_the", "the"},
-        {"h_the",   "the worst"},
-        {"t_worst", "the worst"},
-        {"u_worst", "worst"},
-        {"h_worst", "worst of"},
-        {"t_of",    "worst of"},
-        {"u_of", "of"},
-        {"h_of",    "of times"},
-        {"t_times", "of times"},
-        {"u_times", "times"},
-    };
+    
+    String[][] values = new String[][] { {"h_the", "the best"},
+                                        {"t_best", "the best"},
+                                        {"h_of", "of times"},
+                                        {"t_times", "of times"},
+                                        {"h_best", "best of"},
+                                        {"t_of", "best of"},
+                                        {"h_the", "the worst"},
+                                        {"t_worst", "the worst"},
+                                        {"h_times", "times the"},
+                                        {"t_the", "times the"},
+                                        {"h_worst", "worst of"},
+                                        {"t_of", "worst of"},
+                                        {"u_worst", "worst"}, {"u_of", "of"},
+                                        {"u_the", "the"}, {"u_best", "best"},
+                                        {"u_times", "times"},};
     // set up expectations for mocks. ngram max size = 2
-    for (String[] v: values) {
+    for (String[] v : values) {
       Type p = v[0].startsWith("h") ? HEAD : TAIL;
       p = v[0].startsWith("u") ? UNIGRAM : p;
-      Gram subgram = new Gram(v[0].substring(2), p);
-      Gram ngram = new Gram(v[1]);
+      int frequency = 1;
+      if (v[1].equals("of times") || v[1].equals("of") || v[1].equals("times")
+          || v[1].equals("the")) frequency = 2;
+      Gram subgram = new Gram(v[0].substring(2), frequency, p);
+      Gram ngram = new Gram(v[1], frequency);
       collector.collect(subgram, ngram);
     }
-   
-
+    
     reporter.incrCounter(CollocMapper.Count.NGRAM_TOTAL, 7);
     EasyMock.replay(reporter, collector);
     
-
     JobConf conf = new JobConf();
     conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
     conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);
-
+    
     CollocMapper c = new CollocMapper();
     c.configure(conf);
     
     c.map(key, inputTuple, collector, reporter);
-
+    
     EasyMock.verify(reporter, collector);
   }
-
+  
   /** A lucene 2.9 standard analyzer with no stopwords. */
   public static class TestAnalyzer extends Analyzer {
     final Analyzer a;