You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/12 22:24:09 UTC
svn commit: r909611 -
/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Author: robinanil
Date: Fri Feb 12 21:24:08 2010
New Revision: 909611
URL: http://svn.apache.org/viewvc?rev=909611&view=rev
Log:
adding sequentialaccess option in main file src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=909611&r1=909610&r2=909611&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Fri Feb 12 21:24:08 2010
@@ -136,6 +136,10 @@
"(Optional) The maximum size of ngrams to create"
+ " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
.withShortName("ng").create();
+ Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector")
+ .withRequired(false)
+ .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+ .withShortName("seq").create();
Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
false).withDescription("If set, overwrite the output directory")
@@ -149,6 +153,7 @@
.withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
.withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
+ .withOption(sequentialAccessVectorOpt)
.create();
try {
Parser parser = new Parser();
@@ -250,14 +255,19 @@
+ DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
tokenizedPath);
+
+ boolean sequentialAccessOutput = false;
+ if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
+ sequentialAccessOutput = true;
+ }
DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
- minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
+ minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
if (processIdf) {
TFIDFConverter.processTfIdf(
outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf,
- maxDFPercent, norm);
+ maxDFPercent, norm, sequentialAccessOutput);
}
} catch (OptionException e) {
log.error("Exception", e);