Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/11 05:34:04 UTC

svn commit: r908839 - /lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java

Author: robinanil
Date: Thu Feb 11 04:34:04 2010
New Revision: 908839

URL: http://svn.apache.org/viewvc?rev=908839&view=rev
Log:
MAHOUT-285 Missed out the main class

Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=908839&r1=908838&r2=908839&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Thu Feb 11 04:34:04 2010
@@ -27,11 +27,15 @@
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
 import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Converts a given set of sequence files into SparseVectors
@@ -39,6 +43,9 @@
  */
 public final class SparseVectorsFromSequenceFiles {
   
+  private static final Logger log = LoggerFactory
+      .getLogger(SparseVectorsFromSequenceFiles.class);
+  
   private SparseVectorsFromSequenceFiles() {}
   
   public static void main(String[] args) throws Exception {
@@ -79,7 +86,7 @@
         .withArgument(
           abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
         .withDescription("The kind of weight to use. Currently TF or TFIDF")
-        .withShortName("w").create();
+        .withShortName("wt").create();
     
     Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
         .withArgument(
@@ -96,6 +103,21 @@
         .withDescription(
           "The max percentage of docs for the DF.  Can be used to remove really high frequency terms.  Expressed as an integer between 0 and 100. Default is 99.")
         .withShortName("x").create();
+    
+    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
+        .withArgument(
+          abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
+              + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
+    
+    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
+        .withArgument(
+          abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
+              .create()).withDescription(
+          "(Optional) Number of reduce tasks. Default Value: 1").withShortName(
+          "nr").create();
+    
     Option powerOpt = obuilder
         .withLongName("norm")
         .withRequired(false)
@@ -105,17 +127,39 @@
           "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
               + "Must be greater or equal to 0.  The default is not to normalize")
         .withShortName("n").create();
+    Option maxNGramSizeOpt = obuilder
+        .withLongName("maxNGramSize")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "(Optional) The maximum size of ngrams to create"
+              + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
+        .withShortName("ng").create();
+    
+    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
+      false).withDescription("If set, overwrite the output directory")
+        .withShortName("w").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
     
     Group group = gbuilder.withName("Options").withOption(minSupportOpt)
         .withOption(analyzerNameOpt).withOption(chunkSizeOpt).withOption(
           outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
         .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
+        .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
+          maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
         .create();
     
     Parser parser = new Parser();
     parser.setGroup(group);
     CommandLine cmdLine = parser.parse(args);
     
+    if (cmdLine.hasOption(helpOpt)) {
+      CommandLineUtil.printHelp(group);
+      return;
+    }
+    
     String inputDir = (String) cmdLine.getValue(inputDirOpt);
     String outputDir = (String) cmdLine.getValue(outputDirOpt);
     
@@ -129,6 +173,35 @@
       minSupport = Integer.parseInt(minSupportString);
     }
     
+    int maxNGramSize = 1;
+    
+    if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
+      try {
+        maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+            .toString());
+      } catch (NumberFormatException ex) {
+        log.warn("Could not parse ngram size option");
+      }
+    }
+    log.info("Maximum n-gram size is: {}", maxNGramSize);
+    
+    if (cmdLine.hasOption(overwriteOutput) == true) {
+      HadoopUtil.overwriteOutput(outputDir);
+    }
+    
+    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+    if (cmdLine.hasOption(minLLROpt)) {
+      minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
+    }
+    log.info("Minimum LLR value: {}", minLLRValue);
+    
+    int reduceTasks = 1;
+    if (cmdLine.hasOption(numReduceTasksOpt)) {
+      reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
+          .toString());
+    }
+    log.info("Pass1 reduce tasks: {}", reduceTasks);
+    
     Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
     if (cmdLine.hasOption(analyzerNameOpt)) {
       String className = cmdLine.getValue(analyzerNameOpt).toString();
@@ -176,8 +249,8 @@
     String tokenizedPath = outputDir + "/tokenized-documents";
     DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
     
-    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
-      outputDir, minSupport, chunkSize);
+    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
+      minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
     if (processIdf) {
       TFIDFConverter.processTfIdf(
         outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
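
The new command-line options ultimately feed the extended DictionaryVectorizer.createTermFrequencyVectors call shown in the last hunk. Below is a minimal sketch of a standalone driver exercising that call; the class name, paths, and the minSupport and chunkSize values are illustrative assumptions and are not part of this commit.

    // Sketch of driving the extended createTermFrequencyVectors entry point.
    // Paths and the minSupport/chunkSize values below are illustrative assumptions.
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
    import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
    import org.apache.mahout.utils.vectors.text.DocumentProcessor;

    public final class SparseVectorsSketch {
      public static void main(String[] args) throws Exception {
        String inputDir = "seqfiles";                      // assumed input path
        String outputDir = "vectors";                      // assumed output path
        String tokenizedPath = outputDir + "/tokenized-documents";

        // Tokenize the input sequence files, as the driver does before vectorizing.
        DocumentProcessor.tokenizeDocuments(inputDir, StandardAnalyzer.class, tokenizedPath);

        // maxNGramSize, minLLRValue and reduceTasks are the parameters this commit
        // threads through from the new command-line options into the vectorizer.
        int minSupport = 2;                                // assumed value
        int maxNGramSize = 2;                              // bigrams
        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;    // default of the -ml option
        int reduceTasks = 1;                               // default of the -nr option
        int chunkSize = 100;                               // assumed dictionary chunk size
        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
            minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
      }
    }

On the command line, the same three new values correspond to --maxNGramSize (-ng), --minLLR (-ml), and --numReducers (-nr). Note that --overwrite takes the short name -w, which this commit frees up by moving --weight from -w to -wt.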