You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/12 22:24:09 UTC

svn commit: r909611 - /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java

Author: robinanil
Date: Fri Feb 12 21:24:08 2010
New Revision: 909611

URL: http://svn.apache.org/viewvc?rev=909611&view=rev
Log:
adding sequentialaccess option in main file src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=909611&r1=909610&r2=909611&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Fri Feb 12 21:24:08 2010
@@ -136,6 +136,10 @@
           "(Optional) The maximum size of ngrams to create"
               + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
         .withShortName("ng").create();
+    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector")
+        .withRequired(false)
+        .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors If set true else false")
+        .withShortName("seq").create();
     
     Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
       false).withDescription("If set, overwrite the output directory")
@@ -149,6 +153,7 @@
         .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
         .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
           maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
+        .withOption(sequentialAccessVectorOpt)
         .create();
     try {
       Parser parser = new Parser();
@@ -250,14 +255,19 @@
                              + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
       DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
         tokenizedPath);
+
+      boolean sequentialAccessOutput = false;
+      if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
+        sequentialAccessOutput = true;
+      }
       
       DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
-        minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
+        minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
       if (processIdf) {
         TFIDFConverter.processTfIdf(
           outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
           outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf,
-          maxDFPercent, norm);
+          maxDFPercent, norm, sequentialAccessOutput);
       }
     } catch (OptionException e) {
       log.error("Exception", e);