Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/07 19:55:29 UTC

svn commit: r907465 - in /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text: DictionaryVectorizer.java term/TFPartialVectorReducer.java

Author: robinanil
Date: Sun Feb  7 18:55:29 2010
New Revision: 907465

URL: http://svn.apache.org/viewvc?rev=907465&view=rev
Log:
MAHOUT-277 Increase number of entries in memory per chunk of dictionary
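
For reference, a minimal standalone sketch of the new per-entry size estimate used when splitting the dictionary into chunks. The class and method names below (DictionaryChunkSizing, estimateEntrySize, shouldStartNewChunk) are illustrative only and are not part of this commit; the real logic lives inline in DictionaryVectorizer. The old SEQUENCEFILE_BYTE_OVERHEAD constant (45 bytes) modeled SequenceFile record overhead, while the new DICTIONARY_BYTE_OVERHEAD (4 bytes) models the per-entry cost of the OpenObjectIntHashMap that now holds the chunk on the reducer side, and ids shrink from long to int, so more entries fit into a chunk of the same size limit.

public final class DictionaryChunkSizing {

  // Assumed per-entry overhead of the OpenObjectIntHashMap holding a dictionary chunk.
  private static final int DICTIONARY_BYTE_OVERHEAD = 4;

  // Rough in-memory footprint of one dictionary entry: map overhead
  // + two bytes per character of the term + a 4-byte int id.
  static int estimateEntrySize(String term) {
    return DICTIONARY_BYTE_OVERHEAD + (term.length() * 2) + (Integer.SIZE / 8);
  }

  // A new dictionary chunk is started once the running estimate exceeds the limit.
  static boolean shouldStartNewChunk(long currentChunkSize, long chunkSizeLimit) {
    return currentChunkSize > chunkSizeLimit;
  }

  public static void main(String[] args) {
    // Example: a 10-character term is now estimated at 4 + 20 + 4 = 28 bytes,
    // versus 45 + 20 + 8 = 73 bytes under the previous SequenceFile-based estimate.
    System.out.println(estimateEntrySize("vectorizer"));
  }
}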

Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=907465&r1=907464&r2=907465&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sun Feb  7 18:55:29 2010
@@ -28,6 +28,7 @@
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
@@ -70,7 +71,8 @@
   
   private static final String OUTPUT_FILES_PATTERN = "/part-*";
   
-  private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
+  // 4 byte overhead for each entry in the OpenObjectIntHashMap
+  private static final int DICTIONARY_BYTE_OVERHEAD = 4;
   
   private static final String VECTOR_OUTPUT_FOLDER = "/partial-vectors-";
   
@@ -139,8 +141,8 @@
     
     String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
     if (dictionaryChunks.size() > 1) {
-      PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
-        -1);
+      PartialVectorMerger
+          .mergePartialVectors(partialVectorPaths, outputDir, -1);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -178,8 +180,8 @@
     Path chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
     chunkPaths.add(chunkPath);
     
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, chunkPath,
-        Text.class, LongWritable.class);
+    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf,
+        chunkPath, Text.class, IntWritable.class);
     
     SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, getPath(
       dictionaryPathBase + FREQUENCY_FILE, chunkIndex), Text.class,
@@ -187,7 +189,7 @@
     
     long currentChunkSize = 0;
     
-    long i = 0;
+    int i = 0;
     for (FileStatus fileStatus : outputFiles) {
       Path path = fileStatus.getPath();
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
@@ -198,30 +200,30 @@
         }
         
         if (currentChunkSize > chunkSizeLimit) {
-          writer.close();
+          dictWriter.close();
           freqWriter.close();
           chunkIndex++;
           
           chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
           chunkPaths.add(chunkPath);
           
-          writer = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
-              LongWritable.class);
+          dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
+              IntWritable.class);
           freqWriter = new SequenceFile.Writer(fs, conf, getPath(
             dictionaryPathBase + FREQUENCY_FILE, chunkIndex), Text.class,
               LongWritable.class);
           currentChunkSize = 0;
         }
         
-        int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD
-                        + (key.toString().length() * 2) + (Long.SIZE / 8);
+        int fieldSize = DICTIONARY_BYTE_OVERHEAD
+                        + (key.toString().length() * 2) + (Integer.SIZE / 8);
         currentChunkSize += fieldSize;
-        writer.append(key, new LongWritable(i++));
+        dictWriter.append(key, new IntWritable(i++));
         freqWriter.append(key, value);
       }
     }
     
-    writer.close();
+    dictWriter.close();
     freqWriter.close();
     
     return chunkPaths;

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=907465&r1=907464&r2=907465&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Sun Feb  7 18:55:29 2010
@@ -19,14 +19,12 @@
 
 import java.io.IOException;
 import java.net.URI;
-import java.util.HashMap;
 import java.util.Iterator;
-import java.util.Map;
 
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.JobConf;
@@ -39,6 +37,7 @@
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
 
 /**
  * Converts a document in to a sparse vector
@@ -46,7 +45,7 @@
 public class TFPartialVectorReducer extends MapReduceBase implements
     Reducer<Text,StringTuple,Text,VectorWritable> {
   private Analyzer analyzer;
-  private final Map<String,int[]> dictionary = new HashMap<String,int[]>();
+  private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
   
   private final VectorWritable vectorWritable = new VectorWritable();
   
@@ -61,10 +60,10 @@
     Vector vector = new RandomAccessSparseVector(key.toString(),
         Integer.MAX_VALUE, value.length()); // guess at initial size
     
-    for (String tk : value.getEntries()) {
-      if (dictionary.containsKey(tk) == false) continue;
-      int tokenKey = dictionary.get(tk)[0];
-      vector.setQuick(tokenKey, vector.getQuick(tokenKey) + 1);
+    for (String term : value.getEntries()) {
+      if (dictionary.containsKey(term) == false) continue;
+      int termId = dictionary.get(term);
+      vector.setQuick(termId, vector.getQuick(termId) + 1);
     }
     
     vectorWritable.set(vector);
@@ -87,12 +86,11 @@
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile,
           job);
       Text key = new Text();
-      LongWritable value = new LongWritable();
+      IntWritable value = new IntWritable();
       
       // key is word value is id
       while (reader.next(key, value)) {
-        dictionary.put(key.toString(), new int[] {Long.valueOf(value.get())
-            .intValue()});
+        dictionary.put(key.toString(), value.get());
       }
     } catch (IOException e) {
       throw new IllegalStateException(e);
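
For reference, a minimal standalone sketch of the reducer-side lookup after this change, assuming the dictionary chunk format written above (Text term -> IntWritable id). The TermCountingSketch class and its loadDictionary/countTerms helpers are illustrative only, not part of TFPartialVectorReducer.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.map.OpenObjectIntHashMap;

final class TermCountingSketch {

  // Read one dictionary chunk (Text term -> IntWritable id) into memory.
  static OpenObjectIntHashMap<String> loadDictionary(FileSystem fs,
      Path dictionaryFile, Configuration conf) throws IOException {
    OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    while (reader.next(key, value)) {
      dictionary.put(key.toString(), value.get());
    }
    reader.close();
    return dictionary;
  }

  // Count term occurrences into a sparse vector keyed by dictionary id.
  static Vector countTerms(String docId, Iterable<String> terms,
      OpenObjectIntHashMap<String> dictionary, int sizeGuess) {
    Vector vector = new RandomAccessSparseVector(docId, Integer.MAX_VALUE, sizeGuess);
    for (String term : terms) {
      if (!dictionary.containsKey(term)) {
        continue; // term belongs to another dictionary chunk
      }
      int termId = dictionary.get(term);
      vector.setQuick(termId, vector.getQuick(termId) + 1);
    }
    return vector;
  }
}

Storing ids as primitive ints in an OpenObjectIntHashMap also removes the one-element int[] wrapper (and the Long boxing) that the old HashMap<String,int[]> needed, which is where the per-entry memory saving on the reducer comes from.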