Posted to commits@mahout.apache.org by jm...@apache.org on 2010/02/12 22:21:26 UTC
svn commit: r909608 - in /lucene/mahout/trunk/utils/src:
main/java/org/apache/mahout/utils/vectors/common/
main/java/org/apache/mahout/utils/vectors/text/
main/java/org/apache/mahout/utils/vectors/text/term/
main/java/org/apache/mahout/utils/vectors/tf...
Author: jmannix
Date: Fri Feb 12 21:21:23 2010
New Revision: 909608
URL: http://svn.apache.org/viewvc?rev=909608&view=rev
Log:
Adds the ability to output SequentialAccessSparseVectors, and keeps track of dimension properly.
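
For orientation, a minimal usage sketch of the new flag, modeled on the updated test at the bottom of this commit (all argument values are taken from the test; only the trailing boolean is new, and passing true requests SequentialAccessSparseVector output instead of RandomAccessSparseVector):

    // Sketch only: mirrors DictionaryVectorizerTest, with the new trailing flags set to true.
    DictionaryVectorizer.createTermFrequencyVectors("output/tokenized-documents",
        "output/wordcount", 2, 1, 0.0f, 1, 100, true /* sequentialAccess */);
    TFIDFConverter.processTfIdf("output/wordcount/vectors", "output/tfidf/",
        100, 1, 99, 1.0f, true /* sequentialAccessOutput */);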
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java Fri Feb 12 21:21:23 2010
@@ -27,6 +27,7 @@
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
@@ -40,6 +41,8 @@
private final VectorWritable vectorWritable = new VectorWritable();
private double normPower;
+ private int dimension;
+ private boolean sequentialAccess;
@Override
public void reduce(WritableComparable<?> key,
@@ -47,8 +50,7 @@
OutputCollector<WritableComparable<?>,VectorWritable> output,
Reporter reporter) throws IOException {
- Vector vector = new RandomAccessSparseVector(key
- .toString(), Integer.MAX_VALUE, 10);
+ Vector vector = new RandomAccessSparseVector(key.toString(), dimension, 10);
while (values.hasNext()) {
VectorWritable value = values.next();
value.get().addTo(vector);
@@ -56,6 +58,9 @@
if (normPower != PartialVectorMerger.NO_NORMALIZING) {
vector = vector.normalize(normPower);
}
+ if (sequentialAccess) {
+ vector = new SequentialAccessSparseVector(vector);
+ }
vectorWritable.set(vector);
output.collect(key, vectorWritable);
}
@@ -64,6 +69,8 @@
public void configure(JobConf job) {
super.configure(job);
normPower = job.getFloat(PartialVectorMerger.NORMALIZATION_POWER,
- PartialVectorMerger.NO_NORMALIZING);
+ PartialVectorMerger.NO_NORMALIZING);
+ dimension = job.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
+ sequentialAccess = job.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
}
}
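
Restated outside the diff fragments, the reducer's new flow is (a sketch, not the committed file verbatim): accumulate into a hash-backed RandomAccessSparseVector sized to the real dictionary dimension instead of Integer.MAX_VALUE, normalize if requested, and only then convert to the array-backed SequentialAccessSparseVector, which is cheap to iterate in index order but costly to update incrementally:

    // dimension and sequentialAccess are read from the JobConf in configure(); see the next file.
    Vector vector = new RandomAccessSparseVector(key.toString(), dimension, 10);
    while (values.hasNext()) {
      values.next().get().addTo(vector);                    // merge the partial vectors
    }
    if (normPower != PartialVectorMerger.NO_NORMALIZING) {
      vector = vector.normalize(normPower);                 // optional p-norm normalization
    }
    if (sequentialAccess) {
      vector = new SequentialAccessSparseVector(vector);    // convert once the vector is final
    }
    vectorWritable.set(vector);
    output.collect(key, vectorWritable);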
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java Fri Feb 12 21:21:23 2010
@@ -47,6 +47,10 @@
public static final float NO_NORMALIZING = -1.0f;
public static final String NORMALIZATION_POWER = "normalization.power";
+
+ public static final String DIMENSION = "vector.dimension";
+
+ public static final String SEQUENTIAL_ACCESS = "vector.sequentialAccess";
/**
* Cannot be initialized. Use the static functions
@@ -71,7 +75,9 @@
*/
public static void mergePartialVectors(List<Path> partialVectorPaths,
String output,
- float normPower) throws IOException {
+ float normPower,
+ int dimension,
+ boolean sequentialAccess) throws IOException {
if (normPower != NO_NORMALIZING && normPower < 0) {
throw new IllegalArgumentException("normPower must either be -1 or >= 0");
}
@@ -83,7 +89,8 @@
+ "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set enable serialisation of conf values
conf.setJobName("PartialVectorMerger::MergePartialVectors");
-
+ conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
+ conf.setInt(DIMENSION, dimension);
conf.setFloat(NORMALIZATION_POWER, normPower);
conf.setOutputKeyClass(Text.class);
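
The two new settings round-trip through the JobConf: the driver writes them in mergePartialVectors(...) and PartialVectorMergeReducer reads them back in configure(), falling back to the old behavior (unbounded dimension, random-access output) when the keys are absent. Condensed from the hunks above:

    // Driver side, in PartialVectorMerger.mergePartialVectors(...):
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);

    // Reducer side, in PartialVectorMergeReducer.configure(JobConf job):
    dimension = job.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = job.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);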
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Fri Feb 12 21:21:23 2010
@@ -126,7 +126,8 @@
int maxNGramSize,
float minLLRValue,
int numReducers,
- int chunkSizeInMegabytes) throws IOException {
+ int chunkSizeInMegabytes,
+ boolean sequentialAccess) throws IOException {
if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
} else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -136,18 +137,19 @@
Path inputPath = new Path(input);
Path dictionaryJobPath = new Path(output + DICTIONARY_JOB_FOLDER);
-
+
+ int[] maxTermDimension = new int[1];
List<Path> dictionaryChunks;
if (maxNGramSize == 1) {
startWordCounting(inputPath, dictionaryJobPath, minSupport);
dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath,
- output, chunkSizeInMegabytes, new LongWritable());
+ output, chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
} else {
CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath
.toString(), maxNGramSize, minSupport, minLLRValue, numReducers);
dictionaryChunks = createDictionaryChunks(minSupport, new Path(
output + DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
- chunkSizeInMegabytes, new DoubleWritable());
+ chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
}
int partialVectorIndex = 0;
@@ -156,8 +158,12 @@
Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
- makePartialVectors(input, maxNGramSize, dictionaryChunk,
- partialVectorOutputPath);
+ makePartialVectors(input,
+ maxNGramSize,
+ dictionaryChunk,
+ partialVectorOutputPath,
+ maxTermDimension[0],
+ sequentialAccess);
}
Configuration conf = new Configuration();
@@ -165,8 +171,11 @@
String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
if (dictionaryChunks.size() > 1) {
- PartialVectorMerger
- .mergePartialVectors(partialVectorPaths, outputDir, -1);
+ PartialVectorMerger.mergePartialVectors(partialVectorPaths,
+ outputDir,
+ -1,
+ maxTermDimension[0],
+ sequentialAccess);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -189,7 +198,8 @@
Path wordCountPath,
String dictionaryPathBase,
int chunkSizeInMegabytes,
- Writable value) throws IOException {
+ Writable value,
+ int[] maxTermDimension) throws IOException {
List<Path> chunkPaths = new ArrayList<Path>();
Writable key = new Text();
@@ -233,7 +243,7 @@
dictWriter.append(key, new IntWritable(i++));
}
}
-
+ maxTermDimension[0] = (int)i;
dictWriter.close();
return chunkPaths;
@@ -260,7 +270,9 @@
private static void makePartialVectors(String input,
int maxNGramSize,
Path dictionaryFilePath,
- Path output) throws IOException {
+ Path output,
+ int dimension,
+ boolean sequentialAccess) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
@@ -272,14 +284,15 @@
conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: "
+ input + ", dictionary-file: "
+ dictionaryFilePath.toString());
+ conf.setInt(PartialVectorMerger.DIMENSION, dimension);
+ conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
conf.setInt(MAX_NGRAMS, maxNGramSize);
-
+
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(StringTuple.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
- DistributedCache
- .setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
+ DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, output);
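
One detail worth noting in the DictionaryVectorizer change: the dictionary size is handed back through a one-element int[], since createDictionaryChunks(...) already returns the chunk paths and Java has no by-reference primitive parameters. The last index written to the dictionary becomes the vector cardinality for every downstream job. A sketch of the pattern, using the names from the patch:

    int[] maxTermDimension = new int[1];   // out-parameter for the dictionary size
    List<Path> dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath,
        output, chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
    // inside createDictionaryChunks, after the last term is appended:
    //   maxTermDimension[0] = (int) i;
    // the recorded size is then passed on as the partial vectors' dimension:
    makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
        maxTermDimension[0], sequentialAccess);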
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Fri Feb 12 21:21:23 2010
@@ -36,9 +36,11 @@
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.IteratorTokenStream;
import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
@@ -50,6 +52,9 @@
private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
private final VectorWritable vectorWritable = new VectorWritable();
+
+ private int dimension;
+ private boolean sequentialAccess;
private int maxNGramSize = 1;
@@ -62,7 +67,8 @@
StringTuple value = values.next();
Vector vector = new RandomAccessSparseVector(key.toString(),
- Integer.MAX_VALUE, value.length()); // guess at initial size
+ dimension,
+ value.length()); // guess at initial size
if (maxNGramSize >= 2) {
ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
@@ -89,6 +95,9 @@
}
}
}
+ if (sequentialAccess) {
+ vector = new SequentialAccessSparseVector(vector);
+ }
vectorWritable.set(vector);
output.collect(key, vectorWritable);
@@ -98,6 +107,8 @@
public void configure(JobConf job) {
super.configure(job);
try {
+ dimension = job.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
+ sequentialAccess = job.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
maxNGramSize = job.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
URI[] localFiles = DistributedCache.getCacheFiles(job);
if (localFiles == null || localFiles.length < 1) {
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Fri Feb 12 21:21:23 2010
@@ -123,7 +123,8 @@
int chunkSizeInMegabytes,
int minDf,
int maxDFPercent,
- float normPower) throws IOException {
+ float normPower,
+ boolean sequentialAccessOutput) throws IOException {
if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
} else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
@@ -151,9 +152,13 @@
Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
- makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures
- .getFirst()[1], minDf, maxDFPercent, dictionaryChunk,
- partialVectorOutputPath);
+ makePartialVectors(input,
+ datasetFeatures.getFirst()[0],
+ datasetFeatures.getFirst()[1],
+ minDf,
+ maxDFPercent,
+ dictionaryChunk,
+ partialVectorOutputPath);
}
Configuration conf = new Configuration();
@@ -161,8 +166,11 @@
String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
if (dictionaryChunks.size() > 1) {
- PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
- normPower);
+ PartialVectorMerger.mergePartialVectors(partialVectorPaths,
+ outputDir,
+ normPower,
+ (int)(long)datasetFeatures.getFirst()[0],
+ sequentialAccessOutput);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
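
A small point on the TFIDFConverter hunk above: the merged vectors' dimension comes from the dataset's feature count, and the (int)(long) double cast suggests that count is carried as a boxed Long (its declared type is not visible in this diff). The first cast unboxes to a primitive long, the second narrows it to the int cardinality the vector constructors expect; a single (int) cast applied directly to a boxed Long would not compile. Illustration with a hypothetical value:

    Long featureCount = 41807L;                  // hypothetical boxed count
    int dimension = (int) (long) featureCount;   // unbox to long, then narrow to int

By contrast, the featureCount field used in TFIDFPartialVectorReducer below appears to be a primitive long already, so the single narrowing cast (int) featureCount suffices there.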
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java Fri Feb 12 21:21:23 2010
@@ -64,7 +64,7 @@
Vector value = values.next().get();
Iterator<Element> it = value.iterateNonZero();
Vector vector = new RandomAccessSparseVector(key
- .toString(), Integer.MAX_VALUE, value.getNumNondefaultElements());
+ .toString(), (int)featureCount, value.getNumNondefaultElements());
while (it.hasNext()) {
Element e = it.next();
if (!dictionary.containsKey(e.index())) continue;
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=909608&r1=909607&r2=909608&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Fri Feb 12 21:21:23 2010
@@ -137,8 +137,8 @@
DocumentProcessor.tokenizeDocuments(pathString, analyzer,
"output/tokenized-documents");
DictionaryVectorizer.createTermFrequencyVectors("output/tokenized-documents",
- "output/wordcount", 2, 1, 0.0f, 1, 100);
- TFIDFConverter.processTfIdf("output/wordcount/vectors", "output/tfidf/", 100, 1, 99, 1.0f);
+ "output/wordcount", 2, 1, 0.0f, 1, 100, false);
+ TFIDFConverter.processTfIdf("output/wordcount/vectors", "output/tfidf/", 100, 1, 99, 1.0f, false);
}
}