Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/07 19:55:29 UTC
svn commit: r907465 - in
/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text:
DictionaryVectorizer.java term/TFPartialVectorReducer.java
Author: robinanil
Date: Sun Feb 7 18:55:29 2010
New Revision: 907465
URL: http://svn.apache.org/viewvc?rev=907465&view=rev
Log:
MAHOUT-277 Increase number of entries in memory per chunk of dictionary
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
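The change below replaces the old per-entry size estimate, which charged 45 bytes
of SequenceFile overhead plus an 8-byte long id per dictionary term, with an
estimate matched to the in-memory OpenObjectIntHashMap: 4 bytes of map overhead
plus a 4-byte int id. A smaller per-entry estimate lets more entries fit under
the same chunk size limit. A minimal sketch of the new accounting, assuming a
hypothetical estimateEntrySize helper (the constant mirrors the diff below; the
method itself is not part of the commit):

    // Per-entry memory estimate used when splitting the dictionary into
    // chunks; the constant mirrors the diff, the helper is illustrative.
    private static final int DICTIONARY_BYTE_OVERHEAD = 4;

    static int estimateEntrySize(String term) {
      // 2 bytes per UTF-16 char for the term, a 4-byte int id,
      // and the per-entry overhead of the OpenObjectIntHashMap
      return DICTIONARY_BYTE_OVERHEAD + term.length() * 2 + Integer.SIZE / 8;
    }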
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=907465&r1=907464&r2=907465&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sun Feb 7 18:55:29 2010
@@ -28,6 +28,7 @@
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
@@ -70,7 +71,8 @@
private static final String OUTPUT_FILES_PATTERN = "/part-*";
- private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
+ // 4 byte overhead for each entry in the OpenObjectIntHashMap
+ private static final int DICTIONARY_BYTE_OVERHEAD = 4;
private static final String VECTOR_OUTPUT_FOLDER = "/partial-vectors-";
@@ -139,8 +141,8 @@
String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
if (dictionaryChunks.size() > 1) {
- PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
- -1);
+ PartialVectorMerger
+ .mergePartialVectors(partialVectorPaths, outputDir, -1);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -178,8 +180,8 @@
Path chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
chunkPaths.add(chunkPath);
- SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, chunkPath,
- Text.class, LongWritable.class);
+ SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf,
+ chunkPath, Text.class, IntWritable.class);
SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, getPath(
dictionaryPathBase + FREQUENCY_FILE, chunkIndex), Text.class,
@@ -187,7 +189,7 @@
long currentChunkSize = 0;
- long i = 0;
+ int i = 0;
for (FileStatus fileStatus : outputFiles) {
Path path = fileStatus.getPath();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
@@ -198,30 +200,30 @@
}
if (currentChunkSize > chunkSizeLimit) {
- writer.close();
+ dictWriter.close();
freqWriter.close();
chunkIndex++;
chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
chunkPaths.add(chunkPath);
- writer = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
- LongWritable.class);
+ dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
+ IntWritable.class);
freqWriter = new SequenceFile.Writer(fs, conf, getPath(
dictionaryPathBase + FREQUENCY_FILE, chunkIndex), Text.class,
LongWritable.class);
currentChunkSize = 0;
}
- int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD
- + (key.toString().length() * 2) + (Long.SIZE / 8);
+ int fieldSize = DICTIONARY_BYTE_OVERHEAD
+ + (key.toString().length() * 2) + (Integer.SIZE / 8);
currentChunkSize += fieldSize;
- writer.append(key, new LongWritable(i++));
+ dictWriter.append(key, new IntWritable(i++));
freqWriter.append(key, value);
}
}
- writer.close();
+ dictWriter.close();
freqWriter.close();
return chunkPaths;
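The chunks written above are SequenceFiles of <Text term, IntWritable id> pairs.
A standalone sketch of reading one back with the Hadoop 0.20-era API used in
this code; the class name is illustrative and the chunk path is passed in as an
argument (not part of the commit):

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class DumpDictionaryChunk {
      public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // args[0] is the path of one dictionary chunk
        SequenceFile.Reader reader =
            new SequenceFile.Reader(fs, new Path(args[0]), conf);
        Text term = new Text();
        IntWritable id = new IntWritable();
        // Iterate over the (term, id) pairs in the chunk
        while (reader.next(term, id)) {
          System.out.println(term + "\t" + id.get());
        }
        reader.close();
      }
    }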
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=907465&r1=907464&r2=907465&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Sun Feb 7 18:55:29 2010
@@ -19,14 +19,12 @@
import java.io.IOException;
import java.net.URI;
-import java.util.HashMap;
import java.util.Iterator;
-import java.util.Map;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
@@ -39,6 +37,7 @@
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
/**
* Converts a document into a sparse vector
@@ -46,7 +45,7 @@
public class TFPartialVectorReducer extends MapReduceBase implements
Reducer<Text,StringTuple,Text,VectorWritable> {
private Analyzer analyzer;
- private final Map<String,int[]> dictionary = new HashMap<String,int[]>();
+ private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
private final VectorWritable vectorWritable = new VectorWritable();
@@ -61,10 +60,10 @@
Vector vector = new RandomAccessSparseVector(key.toString(),
Integer.MAX_VALUE, value.length()); // guess at initial size
- for (String tk : value.getEntries()) {
- if (dictionary.containsKey(tk) == false) continue;
- int tokenKey = dictionary.get(tk)[0];
- vector.setQuick(tokenKey, vector.getQuick(tokenKey) + 1);
+ for (String term : value.getEntries()) {
+ if (dictionary.containsKey(term) == false) continue;
+ int termId = dictionary.get(term);
+ vector.setQuick(termId, vector.getQuick(termId) + 1);
}
vectorWritable.set(vector);
@@ -87,12 +86,11 @@
SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile,
job);
Text key = new Text();
- LongWritable value = new LongWritable();
+ IntWritable value = new IntWritable();
// key is word value is id
while (reader.next(key, value)) {
- dictionary.put(key.toString(), new int[] {Long.valueOf(value.get())
- .intValue()});
+ dictionary.put(key.toString(), value.get());
}
} catch (IOException e) {
throw new IllegalStateException(e);
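Before this commit the reducer boxed each id in a one-element int[] inside a
java.util.HashMap; OpenObjectIntHashMap stores primitive ints directly, so the
same dictionary chunk occupies less heap. A minimal standalone sketch of the new
lookup pattern (class name and values are illustrative, not commit code):

    import org.apache.mahout.math.map.OpenObjectIntHashMap;

    public class DictionaryLookupSketch {
      public static void main(String[] args) {
        OpenObjectIntHashMap<String> dictionary =
            new OpenObjectIntHashMap<String>();
        dictionary.put("apache", 0);   // term -> primitive int id, no boxing
        dictionary.put("mahout", 1);

        String term = "mahout";
        if (dictionary.containsKey(term)) {
          int termId = dictionary.get(term);  // returns the int directly
          System.out.println(term + " -> " + termId);
        }
      }
    }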