You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by jm...@apache.org on 2010/02/12 22:21:26 UTC

svn commit: r909608 - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/common/ main/java/org/apache/mahout/utils/vectors/text/ main/java/org/apache/mahout/utils/vectors/text/term/ main/java/org/apache/mahout/utils/vectors/tfidf/ test/java/org/apache/mahout/utils/vectors/text/

Author: jmannix
Date: Fri Feb 12 21:21:23 2010
New Revision: 909608

URL: http://svn.apache.org/viewvc?rev=909608&view=rev
Log:
Adds the ability to output SequentialAccessSparseVectors, and keeps track of dimension properly.

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java Fri Feb 12 21:21:23 2010
@@ -27,6 +27,7 @@
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 
@@ -40,6 +41,8 @@
   private final VectorWritable vectorWritable = new VectorWritable();
   
   private double normPower;
+  private int dimension;
+  private boolean sequentialAccess;
   
   @Override
   public void reduce(WritableComparable<?> key,
@@ -47,8 +50,7 @@
                      OutputCollector<WritableComparable<?>,VectorWritable> output,
                      Reporter reporter) throws IOException {
     
-    Vector vector = new RandomAccessSparseVector(key
-        .toString(), Integer.MAX_VALUE, 10);
+    Vector vector = new RandomAccessSparseVector(key.toString(), dimension, 10);
     while (values.hasNext()) {
       VectorWritable value = values.next();
       value.get().addTo(vector);
@@ -56,6 +58,9 @@
     if (normPower != PartialVectorMerger.NO_NORMALIZING) {
       vector = vector.normalize(normPower);
     }
+    if (sequentialAccess) {
+      vector = new SequentialAccessSparseVector(vector);
+    }
     vectorWritable.set(vector);
     output.collect(key, vectorWritable);
   }
@@ -64,6 +69,8 @@
   public void configure(JobConf job) {
     super.configure(job);
     normPower = job.getFloat(PartialVectorMerger.NORMALIZATION_POWER,
-      PartialVectorMerger.NO_NORMALIZING);
+                             PartialVectorMerger.NO_NORMALIZING);
+    dimension = job.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
+    sequentialAccess = job.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
   }
 }

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Fri Feb 12 21:21:23 2010
@@ -47,6 +47,10 @@
   public static final float NO_NORMALIZING = -1.0f;
   
   public static final String NORMALIZATION_POWER = "normalization.power";
+
+  public static final String DIMENSION = "vector.dimension";
+
+  public static final String SEQUENTIAL_ACCESS = "vector.sequentialAccess";
   
   /**
    * Cannot be initialized. Use the static functions
@@ -71,7 +75,9 @@
    */
   public static void mergePartialVectors(List<Path> partialVectorPaths,
                                          String output,
-                                         float normPower) throws IOException {
+                                         float normPower,
+                                         int dimension,
+                                         boolean sequentialAccess) throws IOException {
     if (normPower != NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }
@@ -83,7 +89,8 @@
           + "org.apache.hadoop.io.serializer.WritableSerialization");
     // this conf parameter needs to be set enable serialisation of conf values
     conf.setJobName("PartialVectorMerger::MergePartialVectors");
-    
+    conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
+    conf.setInt(DIMENSION, dimension);
     conf.setFloat(NORMALIZATION_POWER, normPower);
     
     conf.setOutputKeyClass(Text.class);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Fri Feb 12 21:21:23 2010
@@ -126,7 +126,8 @@
                                                 int maxNGramSize,
                                                 float minLLRValue,
                                                 int numReducers,
-                                                int chunkSizeInMegabytes) throws IOException {
+                                                int chunkSizeInMegabytes,
+                                                boolean sequentialAccess) throws IOException {
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
     } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -136,18 +137,19 @@
     
     Path inputPath = new Path(input);
     Path dictionaryJobPath = new Path(output + DICTIONARY_JOB_FOLDER);
-    
+
+    int[] maxTermDimension = new int[1];
     List<Path> dictionaryChunks;
     if (maxNGramSize == 1) {
       startWordCounting(inputPath, dictionaryJobPath, minSupport);
       dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath,
-        output, chunkSizeInMegabytes, new LongWritable());
+        output, chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
     } else {
       CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath
           .toString(), maxNGramSize, minSupport, minLLRValue, numReducers);
       dictionaryChunks = createDictionaryChunks(minSupport, new Path(
           output + DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
-        chunkSizeInMegabytes, new DoubleWritable());
+        chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
     }
     
     int partialVectorIndex = 0;
@@ -156,8 +158,12 @@
       Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
         partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
-      makePartialVectors(input, maxNGramSize, dictionaryChunk,
-        partialVectorOutputPath);
+      makePartialVectors(input,
+                         maxNGramSize,
+                         dictionaryChunk,
+                         partialVectorOutputPath,
+                         maxTermDimension[0],
+                         sequentialAccess);
     }
     
     Configuration conf = new Configuration();
@@ -165,8 +171,11 @@
     
     String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
     if (dictionaryChunks.size() > 1) {
-      PartialVectorMerger
-          .mergePartialVectors(partialVectorPaths, outputDir, -1);
+      PartialVectorMerger.mergePartialVectors(partialVectorPaths,
+                                              outputDir,
+                                              -1,
+                                              maxTermDimension[0],
+                                              sequentialAccess);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -189,7 +198,8 @@
                                                    Path wordCountPath,
                                                    String dictionaryPathBase,
                                                    int chunkSizeInMegabytes,
-                                                   Writable value) throws IOException {
+                                                   Writable value,
+                                                   int[] maxTermDimension) throws IOException {
     List<Path> chunkPaths = new ArrayList<Path>();
     
     Writable key = new Text();
@@ -233,7 +243,7 @@
         dictWriter.append(key, new IntWritable(i++));
       }
     }
-    
+    maxTermDimension[0] = (int)i;
     dictWriter.close();
     
     return chunkPaths;
@@ -260,7 +270,9 @@
   private static void makePartialVectors(String input,
                                          int maxNGramSize,
                                          Path dictionaryFilePath,
-                                         Path output) throws IOException {
+                                         Path output,
+                                         int dimension,
+                                         boolean sequentialAccess) throws IOException {
     
     Configurable client = new JobClient();
     JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -272,14 +284,15 @@
     conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: "
                     + input + ", dictionary-file: "
                     + dictionaryFilePath.toString());
+    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
+    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
     conf.setInt(MAX_NGRAMS, maxNGramSize);
-    
+
     conf.setMapOutputKeyClass(Text.class);
     conf.setMapOutputValueClass(StringTuple.class);
     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(VectorWritable.class);
-    DistributedCache
-        .setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
+    DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
     FileInputFormat.setInputPaths(conf, new Path(input));
     
     FileOutputFormat.setOutputPath(conf, output);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Fri Feb 12 21:21:23 2010
@@ -36,9 +36,11 @@
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.mahout.common.StringTuple;
 import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.IteratorTokenStream;
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 
@@ -50,6 +52,9 @@
   private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
   
   private final VectorWritable vectorWritable = new VectorWritable();
+
+  private int dimension;
+  private boolean sequentialAccess;
   
   private int maxNGramSize = 1;
   
@@ -62,7 +67,8 @@
     StringTuple value = values.next();
     
     Vector vector = new RandomAccessSparseVector(key.toString(),
-        Integer.MAX_VALUE, value.length()); // guess at initial size
+                                                 dimension,
+                                                 value.length()); // guess at initial size
     
     if (maxNGramSize >= 2) {
       ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
@@ -89,6 +95,9 @@
         }
       }
     }
+    if (sequentialAccess) {
+      vector = new SequentialAccessSparseVector(vector);
+    }
     vectorWritable.set(vector);
     output.collect(key, vectorWritable);
     
@@ -98,6 +107,8 @@
   public void configure(JobConf job) {
     super.configure(job);
     try {
+      dimension = job.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
+      sequentialAccess = job.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
       maxNGramSize = job.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
       URI[] localFiles = DistributedCache.getCacheFiles(job);
       if (localFiles == null || localFiles.length < 1) {

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Fri Feb 12 21:21:23 2010
@@ -123,7 +123,8 @@
                                   int chunkSizeInMegabytes,
                                   int minDf,
                                   int maxDFPercent,
-                                  float normPower) throws IOException {
+                                  float normPower,
+                                  boolean sequentialAccessOutput) throws IOException {
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
     } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -151,9 +152,13 @@
       Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
         partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
-      makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures
-          .getFirst()[1], minDf, maxDFPercent, dictionaryChunk,
-        partialVectorOutputPath);
+      makePartialVectors(input,
+                         datasetFeatures.getFirst()[0],
+                         datasetFeatures.getFirst()[1],
+                         minDf,
+                         maxDFPercent,
+                         dictionaryChunk,
+                         partialVectorOutputPath);
     }
     
     Configuration conf = new Configuration();
@@ -161,8 +166,11 @@
     
     String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
     if (dictionaryChunks.size() > 1) {
-      PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
-        normPower);
+      PartialVectorMerger.mergePartialVectors(partialVectorPaths,
+                                              outputDir,
+                                              normPower,
+                                              (int)(long)datasetFeatures.getFirst()[0],
+                                              sequentialAccessOutput);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java Fri Feb 12 21:21:23 2010
@@ -64,7 +64,7 @@
     Vector value = values.next().get();
     Iterator<Element> it = value.iterateNonZero();
     Vector vector = new RandomAccessSparseVector(key
-        .toString(), Integer.MAX_VALUE, value.getNumNondefaultElements());
+        .toString(), (int)featureCount, value.getNumNondefaultElements());
     while (it.hasNext()) {
       Element e = it.next();
       if (!dictionary.containsKey(e.index())) continue;

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Fri Feb 12 21:21:23 2010
@@ -137,8 +137,8 @@
     DocumentProcessor.tokenizeDocuments(pathString, analyzer,
       "output/tokenized-documents");
     DictionaryVectorizer.createTermFrequencyVectors("output/tokenized-documents",
-      "output/wordcount", 2, 1, 0.0f, 1, 100);
-    TFIDFConverter.processTfIdf("output/wordcount/vectors", "output/tfidf/", 100, 1, 99, 1.0f);
+      "output/wordcount", 2, 1, 0.0f, 1, 100, false);
+    TFIDFConverter.processTfIdf("output/wordcount/vectors", "output/tfidf/", 100, 1, 99, 1.0f, false);
     
   }
 }