Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/11 05:34:04 UTC
svn commit: r908839 -
/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Author: robinanil
Date: Thu Feb 11 04:34:04 2010
New Revision: 908839
URL: http://svn.apache.org/viewvc?rev=908839&view=rev
Log:
MAHOUT-285 Missed out the main class
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=908839&r1=908838&r2=908839&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Thu Feb 11 04:34:04 2010
@@ -27,11 +27,15 @@
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;
import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Converts a given set of sequence files into SparseVectors
@@ -39,6 +43,9 @@
*/
public final class SparseVectorsFromSequenceFiles {
+ private static final Logger log = LoggerFactory
+ .getLogger(SparseVectorsFromSequenceFiles.class);
+
private SparseVectorsFromSequenceFiles() {}
public static void main(String[] args) throws Exception {
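(The logger introduced above backs the log.info and log.warn calls added further down in this commit, which report the parsed n-gram, LLR, and reducer settings at startup.)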
@@ -79,7 +86,7 @@
.withArgument(
abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
.withDescription("The kind of weight to use. Currently TF or TFIDF")
- .withShortName("w").create();
+ .withShortName("wt").create();
Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
.withArgument(
@@ -96,6 +103,21 @@
.withDescription(
"The max percentage of docs for the DF. Can be used to remove really high frequency terms. Expressed as an integer between 0 and 100. Default is 99.")
.withShortName("x").create();
+
+ Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
+ .withArgument(
+ abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "(Optional)The minimum Log Likelihood Ratio(Float) Default is "
+ + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
+
+ Option numReduceTasksOpt = obuilder.withLongName("numReducers")
+ .withArgument(
+ abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
+ .create()).withDescription(
+ "(Optional) Number of reduce tasks. Default Value: 1").withShortName(
+ "nr").create();
+
Option powerOpt = obuilder
.withLongName("norm")
.withRequired(false)
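For context on the two options added above: --minLLR thresholds candidate n-grams by their log-likelihood ratio score before they are admitted to the dictionary. The following is an editorial sketch of that statistic (Dunning's test over a 2x2 contingency table of bigram counts), for illustration only; it is not the LLRReducer implementation:

    // Sketch: log-likelihood ratio for a bigram "a b".
    // k11 = count(a b), k12 = count(a without b),
    // k21 = count(b without a), k22 = count(neither).
    static double logLikelihoodRatio(long k11, long k12, long k21, long k22) {
      double rowEntropy = entropy(k11 + k12, k21 + k22);
      double columnEntropy = entropy(k11 + k21, k12 + k22);
      double matrixEntropy = entropy(k11, k12, k21, k22);
      // Clamp tiny negative values caused by floating-point rounding.
      return Math.max(0.0, 2.0 * (rowEntropy + columnEntropy - matrixEntropy));
    }

    // Unnormalized entropy: N*ln(N) - sum(k*ln(k)).
    static double entropy(long... counts) {
      double sumKLogK = 0.0;
      long total = 0;
      for (long k : counts) {
        if (k > 0) {
          sumKLogK += k * Math.log(k);
          total += k;
        }
      }
      return total == 0 ? 0.0 : total * Math.log(total) - sumKLogK;
    }

Bigrams scoring below minLLR are dropped, so raising the threshold keeps only strongly associated word pairs.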
@@ -105,17 +127,39 @@
"The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
+ "Must be greater or equal to 0. The default is not to normalize")
.withShortName("n").create();
+ Option maxNGramSizeOpt = obuilder
+ .withLongName("maxNGramSize")
+ .withRequired(false)
+ .withArgument(
+ abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "(Optional) The maximum size of ngrams to create"
+ + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
+ .withShortName("ng").create();
+
+ Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
+ false).withDescription("If set, overwrite the output directory")
+ .withShortName("w").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription(
+ "Print out help").withShortName("h").create();
Group group = gbuilder.withName("Options").withOption(minSupportOpt)
.withOption(analyzerNameOpt).withOption(chunkSizeOpt).withOption(
outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
.withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
+ .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
+ maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
.create();
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
String inputDir = (String) cmdLine.getValue(inputDirOpt);
String outputDir = (String) cmdLine.getValue(outputDirOpt);
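Worth noting: the help check above only runs if parsing succeeds, and commons-cli2's Parser.parse(...) throws org.apache.commons.cli2.OptionException on malformed input, so bad flags still surface as an exception from main. A defensive variant (a sketch of one possible arrangement, not what this commit does) would be:

    try {
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      // ... continue reading option values ...
    } catch (OptionException e) {
      // Malformed arguments: print usage instead of a stack trace.
      CommandLineUtil.printHelp(group);
    }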
@@ -129,6 +173,35 @@
minSupport = Integer.parseInt(minSupportString);
}
+ int maxNGramSize = 1;
+
+ if (cmdLine.hasOption(maxNGramSizeOpt)) {
+ try {
+ maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+ .toString());
+ } catch (NumberFormatException ex) {
+ log.warn("Could not parse ngram size option");
+ }
+ }
+ log.info("Maximum n-gram size is: {}", maxNGramSize);
+
+ if (cmdLine.hasOption(overwriteOutput)) {
+ HadoopUtil.overwriteOutput(outputDir);
+ }
+
+ float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+ if (cmdLine.hasOption(minLLROpt)) {
+ minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
+ }
+ log.info("Minimum LLR value: {}", minLLRValue);
+
+ int reduceTasks = 1;
+ if (cmdLine.hasOption(numReduceTasksOpt)) {
+ reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
+ .toString());
+ }
+ log.info("Pass1 reduce tasks: {}", reduceTasks);
+
Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
String className = cmdLine.getValue(analyzerNameOpt).toString();
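The hunk above repeats one pattern several times: optional flag, numeric parse, fall back to a default. If more numeric options accumulate, a small helper along these lines could absorb the duplication; this is an editorial sketch, not part of the commit, and assumes cli2's Option.getPreferredName() for the option's display name:

    private static int intOption(CommandLine cmdLine, Option opt, int defaultValue) {
      if (!cmdLine.hasOption(opt)) {
        return defaultValue;
      }
      try {
        return Integer.parseInt(cmdLine.getValue(opt).toString());
      } catch (NumberFormatException e) {
        log.warn("Could not parse {}; using default {}",
            opt.getPreferredName(), defaultValue);
        return defaultValue;
      }
    }

Usage would then collapse to, e.g., maxNGramSize = intOption(cmdLine, maxNGramSizeOpt, 1);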
@@ -176,8 +249,8 @@
String tokenizedPath = outputDir + "/tokenized-documents";
DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
- outputDir, minSupport, chunkSize);
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
+ minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
if (processIdf) {
TFIDFConverter.processTfIdf(
outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,