Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/13 18:56:01 UTC
svn commit: r909861 [3/4] - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/clustering/lda/ main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/ main/java/org/apache/mahout/utils/clustering/ main/java/org/apache/mahout/ut...
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Sat Feb 13 17:55:56 2010
@@ -17,6 +17,14 @@
package org.apache.mahout.utils.vectors.lucene;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -46,87 +54,83 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.nio.charset.Charset;
-
public class Driver {
private static final Logger log = LoggerFactory.getLogger(Driver.class);
-
- private Driver() {
- }
-
+
+ private Driver() {}
+
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
+
Option inputOpt = obuilder.withLongName("dir").withRequired(true).withArgument(
- abuilder.withName("dir").withMinimum(1).withMaximum(1).create()).
- withDescription("The Lucene directory").withShortName("d").create();
-
+ abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
+ .withDescription("The Lucene directory").withShortName("d").create();
+
Option outputOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The output file").withShortName("o").create();
-
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The output file")
+ .withShortName("o").create();
+
Option fieldOpt = obuilder.withLongName("field").withRequired(true).withArgument(
- abuilder.withName("field").withMinimum(1).withMaximum(1).create()).
- withDescription("The field in the index").withShortName("f").create();
-
+ abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The field in the index").withShortName("f").create();
+
Option idFieldOpt = obuilder.withLongName("idField").withRequired(false).withArgument(
- abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
- withDescription("The field in the index containing the index. If null, then the Lucene internal doc " +
- "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-
+ abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The field in the index containing the index. If null, then the Lucene internal doc "
+ + "id is used which is prone to error if the underlying index changes").withShortName("i").create();
+
Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
- abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
- withDescription("The output of the dictionary").withShortName("t").create();
-
+ abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The output of the dictionary").withShortName("t").create();
+
Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
- abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).
- withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
-
+ abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
+
Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
- abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
- withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
+ abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The delimiter for outputing the dictionary").withShortName("l").create();
Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
- abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).
- withDescription("The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. " +
- "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
+ abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm. "
+ + "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
- abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
- withDescription("The maximum number of vectors to output. If not specified, then it will loop over all docs").withShortName("m").create();
-
+ abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The maximum number of vectors to output. If not specified, then it will loop over all docs")
+ .withShortName("m").create();
+
Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument(
- abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).
- withDescription("The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)").withShortName("e").create();
+ abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The VectorWriter to use, either seq "
+ + "(SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)")
+ .withShortName("e").create();
Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
- abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).
- withDescription("The minimum document frequency. Default is 1").withShortName("md").create();
+ abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The minimum document frequency. Default is 1").withShortName("md").create();
Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
- abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).
- withDescription("The max percentage of docs for the DF. Can be used to remove really high frequency terms. Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
- Option helpOpt = obuilder.withLongName("help").
- withDescription("Print out help").withShortName("h").create();
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
- .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
- .withOption(weightOpt).withOption(minDFOpt).create();
+ abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+ "The max percentage of docs for the DF. Can be used to remove really high frequency terms."
+ + " Expressed as an integer between 0 and 100. Default is 99.").withShortName("x").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
+ outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
+ .withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
+ .withOption(weightOpt).withOption(minDFOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
if (cmdLine.hasOption(helpOpt)) {
-
+
CommandLineUtil.printHelp(group);
return;
}
- //Springify all this
- if (cmdLine.hasOption(inputOpt)) {//Lucene case
+ // Springify all this
+ if (cmdLine.hasOption(inputOpt)) {// Lucene case
File file = new File(cmdLine.getValue(inputOpt).toString());
if (file.exists() && file.isDirectory()) {
long maxDocs = Long.MAX_VALUE;
@@ -182,8 +186,8 @@
iterable = new LuceneIterable(reader, idField, field, mapper, norm);
}
String outFile = cmdLine.getValue(outputOpt).toString();
- log.info("Output File: {}", outFile);
-
+ Driver.log.info("Output File: {}", outFile);
+
VectorWriter vectorWriter;
if (cmdLine.hasOption(outWriterOpt)) {
String outWriter = cmdLine.getValue(outWriterOpt).toString();
@@ -191,42 +195,44 @@
BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
vectorWriter = new JWriterVectorWriter(writer);
} else {
- vectorWriter = getSeqFileWriter(outFile);
+ vectorWriter = Driver.getSeqFileWriter(outFile);
}
} else {
- vectorWriter = getSeqFileWriter(outFile);
+ vectorWriter = Driver.getSeqFileWriter(outFile);
}
-
+
long numDocs = vectorWriter.write(iterable, maxDocs);
vectorWriter.close();
- log.info("Wrote: {} vectors", numDocs);
-
- String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
+ Driver.log.info("Wrote: {} vectors", numDocs);
+
+ String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString()
+ : "\t";
File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
- log.info("Dictionary Output file: {}", dictOutFile);
- BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+ Driver.log.info("Dictionary Output file: {}", dictOutFile);
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
tiWriter.write(termInfo);
tiWriter.close();
writer.close();
}
}
-
+
} catch (OptionException e) {
- log.error("Exception", e);
+ Driver.log.error("Exception", e);
CommandLineUtil.printHelp(group);
}
}
-
+
private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
Path path = new Path(outFile);
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
- //TODO: Make this parameter driven
- SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class, VectorWritable.class);
-
+ // TODO: Make this parameter driven
+ SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, LongWritable.class,
+ VectorWritable.class);
+
return new SequenceFileVectorWriter(seqWriter);
}
-
-
+
}
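
For reference, a minimal sketch of invoking the reformatted Driver above; the index and output paths are placeholders, and only options declared in the code are used:

import java.io.IOException;

import org.apache.mahout.utils.vectors.lucene.Driver;

/** Hedged usage sketch; paths are hypothetical and not part of this commit. */
public class DriverUsageSketch {
  public static void main(String[] unused) throws IOException {
    Driver.main(new String[] {
        "--dir", "/tmp/lucene-index",        // -d: the Lucene directory
        "--field", "body",                   // -f: field with stored term vectors
        "--output", "/tmp/vectors",          // -o: output file (SequenceFile writer by default)
        "--dictOut", "/tmp/dictionary.txt",  // -t: dictionary output
        "--weight", "TFIDF",                 // -w: TF or TFIDF
        "--max", "50"});                     // -m: cap the number of vectors written
  }
}
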
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Sat Feb 13 17:55:56 2010
@@ -17,36 +17,36 @@
package org.apache.mahout.utils.vectors.lucene;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.mahout.math.Vector;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Iterator;
-
/**
* A LuceneIterable is an Iterable<Vector> that uses a Lucene index as the source for creating the {@link Vector}.
* The Field used to create the Vector currently must have Term Vectors stored for it.
*/
public class LuceneIterable implements Iterable<Vector> {
-
- private IndexReader indexReader;
- private String field;
- private String idField;
- private FieldSelector idFieldSelector;
-
- private VectorMapper mapper;
- private double normPower = NO_NORMALIZING;
-
+
+ private final IndexReader indexReader;
+ private final String field;
+ private final String idField;
+ private final FieldSelector idFieldSelector;
+
+ private final VectorMapper mapper;
+ private double normPower = LuceneIterable.NO_NORMALIZING;
+
public static final double NO_NORMALIZING = -1.0;
-
+
public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper) {
- this(reader, idField, field, mapper, NO_NORMALIZING);
+ this(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING);
}
-
+
/**
* Produce a LuceneIterable that can create the Vector plus normalize it.
*
@@ -57,7 +57,7 @@
* @param normPower The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
*/
public LuceneIterable(IndexReader reader, String idField, String field, VectorMapper mapper, double normPower) {
- if (normPower != NO_NORMALIZING && normPower < 0) {
+ if (normPower != LuceneIterable.NO_NORMALIZING && normPower < 0) {
throw new IllegalArgumentException("normPower must either be -1 or >= 0");
}
idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
@@ -67,8 +67,8 @@
this.mapper = mapper;
this.normPower = normPower;
}
-
-
+
+
@Override
public Iterator<Vector> iterator() {
try {
@@ -77,25 +77,25 @@
throw new IllegalStateException(e);
}
}
-
+
private class TDIterator implements Iterator<Vector> {
private final TermDocs termDocs;
-
+
private TDIterator() throws IOException {
//term docs(null) is a better way of iterating all the docs in Lucene
this.termDocs = indexReader.termDocs(null);
}
-
+
@Override
public boolean hasNext() {
- // TODO this doesn't work with the Iterator contract -- hasNext() cannot have a side effect
+ // TODO this doesn't work with the Iterator contract -- hasNext() cannot have a side effect
try {
return termDocs.next();
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
-
+
@Override
public Vector next() {
Vector result;
@@ -114,24 +114,24 @@
} else {
result.setName(String.valueOf(doc));
}
- if (normPower != NO_NORMALIZING) {
+ if (normPower != LuceneIterable.NO_NORMALIZING) {
result = result.normalize(normPower);
}
} catch (IOException e) {
//Log?
throw new IllegalStateException(e);
}
-
+
return result;
}
-
-
+
+
@Override
public void remove() {
throw new UnsupportedOperationException();
}
-
+
}
-
-
+
+
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java Sat Feb 13 17:55:56 2010
@@ -21,46 +21,46 @@
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
-import org.apache.mahout.utils.vectors.Weight;
import org.apache.mahout.utils.vectors.TermEntry;
import org.apache.mahout.utils.vectors.TermInfo;
+import org.apache.mahout.utils.vectors.Weight;
/**
* Not thread-safe
*/
public class TFDFMapper extends VectorMapper {
-
+
//public static final int DEFAULT_CACHE_SIZE = 256;
-
+
//private final IndexReader reader; // TODO never used?
private Vector vector;
-
+
private final Weight weight;
private int numTerms;
private final TermInfo termInfo;
private String field;
private final int numDocs;
-
+
public TFDFMapper(IndexReader reader, Weight weight, TermInfo termInfo) {
//this.reader = reader;
this.weight = weight;
this.termInfo = termInfo;
this.numDocs = reader.numDocs();
}
-
+
@Override
public Vector getVector() {
return vector;
}
-
+
@Override
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
this.field = field;
vector = new RandomAccessSparseVector(termInfo.totalTerms(field));
this.numTerms = numTerms;
}
-
+
@Override
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
TermEntry entry = termInfo.getTermEntry(field, term);
@@ -68,12 +68,12 @@
vector.setQuick(entry.termIdx, weight.calculate(frequency, entry.docFreq, numTerms, numDocs));
}
}
-
+
@Override
public boolean isIgnoringPositions() {
return true;
}
-
+
@Override
public boolean isIgnoringOffsets() {
return true;
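
The LuceneIterable and TFDFMapper reformatted above compose as follows. A minimal sketch, assuming an already open IndexReader, an existing TermInfo implementation, a stored-term-vector field named "body", and that TFIDF implements the Weight interface:

import org.apache.lucene.index.IndexReader;

import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TFIDF;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.utils.vectors.Weight;
import org.apache.mahout.utils.vectors.lucene.LuceneIterable;
import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
import org.apache.mahout.utils.vectors.lucene.VectorMapper;

public class LuceneVectorSketch {
  /** Hedged sketch: iterate weighted document vectors out of a Lucene index. */
  public static void iterate(IndexReader reader, TermInfo termInfo) {
    Weight weight = new TFIDF();  // assumed Weight implementation
    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
    Iterable<Vector> docs =
        new LuceneIterable(reader, "id", "body", mapper, LuceneIterable.NO_NORMALIZING);
    for (Vector vector : docs) {
      // each Vector holds one document's weighted term frequencies
    }
  }
}
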
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Sat Feb 13 17:55:56 2010
@@ -51,11 +51,10 @@
import org.apache.mahout.utils.vectors.text.term.TermCountReducer;
/**
- * This class converts a set of input documents in the sequence file format to
- * vectors. The Sequence file input should have a {@link Text} key containing
- * the unique document identifier and a {@link StringTuple} value containing the
- * tokenized document. You may use {@link DocumentProcessor} to tokenize the
- * document. This is a dictionary based Vectorizer.
+ * This class converts a set of input documents in the sequence file format to vectors. The Sequence file
+ * input should have a {@link Text} key containing the unique document identifier and a {@link StringTuple}
+ * value containing the tokenized document. You may use {@link DocumentProcessor} to tokenize the document.
+ * This is a dictionary based Vectorizer.
*
*/
public final class DictionaryVectorizer {
@@ -91,33 +90,28 @@
}
/**
- * Create Term Frequency (Tf) Vectors from the input set of documents in
- * {@link SequenceFile} format. This tries to fix the maximum memory used by
- * the feature chunk per node thereby splitting the process across multiple
- * map/reduces.
+ * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
+ * tries to fix the maximum memory used by the feature chunk per node thereby splitting the process across
+ * multiple map/reduces.
*
* @param input
* input directory of the documents in {@link SequenceFile} format
* @param output
- * output directory where
- * {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the
- * document are generated
+ * output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
+ * are generated
* @param minSupport
- * the minimum frequency of the feature in the entire corpus to be
- * considered for inclusion in the sparse vector
+ * the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
+ * sparse vector
* @param maxNGramSize
- * 1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and
- * trigram
+ * 1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
* @param minLLRValue
* minValue of log likelihood ratio to used to prune ngrams
* @param chunkSizeInMegabytes
- * the size in MB of the feature => id chunk to be kept in memory at
- * each node during Map/Reduce stage. Its recommended you calculated
- * this based on the number of cores and the free memory available to
- * you per node. Say, you have 2 cores and around 1GB extra memory to
- * spare we recommend you use a split size of around 400-500MB so
- * that two simultaneous reducers can create partial vectors without
- * thrashing the system due to increased swapping
+ * the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
+ * stage. Its recommended you calculated this based on the number of cores and the free memory
+ * available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
+ * recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
+ * partial vectors without thrashing the system due to increased swapping
* @throws IOException
*/
public static void createTermFrequencyVectors(String input,
@@ -128,54 +122,49 @@
int numReducers,
int chunkSizeInMegabytes,
boolean sequentialAccess) throws IOException {
- if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
- chunkSizeInMegabytes = MIN_CHUNKSIZE;
- } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
- chunkSizeInMegabytes = MAX_CHUNKSIZE;
+ if (chunkSizeInMegabytes < DictionaryVectorizer.MIN_CHUNKSIZE) {
+ chunkSizeInMegabytes = DictionaryVectorizer.MIN_CHUNKSIZE;
+ } else if (chunkSizeInMegabytes > DictionaryVectorizer.MAX_CHUNKSIZE) { // 10GB
+ chunkSizeInMegabytes = DictionaryVectorizer.MAX_CHUNKSIZE;
+ }
+ if (minSupport < 0) {
+ minSupport = DictionaryVectorizer.DEFAULT_MIN_SUPPORT;
}
- if (minSupport < 0) minSupport = DEFAULT_MIN_SUPPORT;
Path inputPath = new Path(input);
- Path dictionaryJobPath = new Path(output + DICTIONARY_JOB_FOLDER);
-
+ Path dictionaryJobPath = new Path(output + DictionaryVectorizer.DICTIONARY_JOB_FOLDER);
+
int[] maxTermDimension = new int[1];
List<Path> dictionaryChunks;
if (maxNGramSize == 1) {
- startWordCounting(inputPath, dictionaryJobPath, minSupport);
- dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath,
- output, chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
+ DictionaryVectorizer.startWordCounting(inputPath, dictionaryJobPath, minSupport);
+ dictionaryChunks = DictionaryVectorizer.createDictionaryChunks(minSupport, dictionaryJobPath, output,
+ chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
} else {
- CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath
- .toString(), maxNGramSize, minSupport, minLLRValue, numReducers);
- dictionaryChunks = createDictionaryChunks(minSupport, new Path(
- output + DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
+ CollocDriver.generateAllGrams(inputPath.toString(), dictionaryJobPath.toString(), maxNGramSize,
+ minSupport, minLLRValue, numReducers);
+ dictionaryChunks = DictionaryVectorizer.createDictionaryChunks(minSupport, new Path(
+ output + DictionaryVectorizer.DICTIONARY_JOB_FOLDER, CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
chunkSizeInMegabytes, new DoubleWritable(), maxTermDimension);
}
int partialVectorIndex = 0;
List<Path> partialVectorPaths = new ArrayList<Path>();
for (Path dictionaryChunk : dictionaryChunks) {
- Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
- partialVectorIndex++);
+ Path partialVectorOutputPath = DictionaryVectorizer.getPath(
+ output + DictionaryVectorizer.VECTOR_OUTPUT_FOLDER, partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
- makePartialVectors(input,
- maxNGramSize,
- dictionaryChunk,
- partialVectorOutputPath,
- maxTermDimension[0],
- sequentialAccess);
+ DictionaryVectorizer.makePartialVectors(input, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
+ maxTermDimension[0], sequentialAccess);
}
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);
- String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
+ String outputDir = output + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
if (dictionaryChunks.size() > 1) {
- PartialVectorMerger.mergePartialVectors(partialVectorPaths,
- outputDir,
- -1,
- maxTermDimension[0],
- sequentialAccess);
+ PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, -1, maxTermDimension[0],
+ sequentialAccess);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -185,9 +174,8 @@
}
/**
- * Read the feature frequency List which is built at the end of the Word Count
- * Job and assign ids to them. This will use constant memory and will run at
- * the speed of your disk read
+ * Read the feature frequency List which is built at the end of the Word Count Job and assign ids to them.
+ * This will use constant memory and will run at the speed of your disk read
*
* @param minSupport
* @param wordCountPath
@@ -207,15 +195,16 @@
FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
FileStatus[] outputFiles = fs.globStatus(new Path(wordCountPath.toString()
- + OUTPUT_FILES_PATTERN));
+ + DictionaryVectorizer.OUTPUT_FILES_PATTERN));
long chunkSizeLimit = chunkSizeInMegabytes * 1024 * 1024;
int chunkIndex = 0;
- Path chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
+ Path chunkPath = DictionaryVectorizer.getPath(dictionaryPathBase + DictionaryVectorizer.DICTIONARY_FILE,
+ chunkIndex);
chunkPaths.add(chunkPath);
- SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf,
- chunkPath, Text.class, IntWritable.class);
+ SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
+ IntWritable.class);
long currentChunkSize = 0;
@@ -229,21 +218,21 @@
dictWriter.close();
chunkIndex++;
- chunkPath = getPath(dictionaryPathBase + DICTIONARY_FILE, chunkIndex);
+ chunkPath = DictionaryVectorizer.getPath(dictionaryPathBase + DictionaryVectorizer.DICTIONARY_FILE,
+ chunkIndex);
chunkPaths.add(chunkPath);
- dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class,
- IntWritable.class);
+ dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
currentChunkSize = 0;
}
- int fieldSize = DICTIONARY_BYTE_OVERHEAD
- + (key.toString().length() * 2) + (Integer.SIZE / 8);
+ int fieldSize = DictionaryVectorizer.DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2
+ + Integer.SIZE / 8;
currentChunkSize += fieldSize;
dictWriter.append(key, new IntWritable(i++));
}
}
- maxTermDimension[0] = (int)i;
+ maxTermDimension[0] = i;
dictWriter.close();
return chunkPaths;
@@ -254,8 +243,8 @@
}
/**
- * Create a partial vector using a chunk of features from the input documents.
- * The input documents has to be in the {@link SequenceFile} format
+ * Create a partial vector using a chunk of features from the input documents. The input documents has to be
+ * in the {@link SequenceFile} format
*
* @param input
* input directory of the documents in {@link SequenceFile} format
@@ -276,18 +265,16 @@
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set enable serialisation of conf values
- conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: "
- + input + ", dictionary-file: "
- + dictionaryFilePath.toString());
+ conf.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
+ + ", dictionary-file: " + dictionaryFilePath.toString());
conf.setInt(PartialVectorMerger.DIMENSION, dimension);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
- conf.setInt(MAX_NGRAMS, maxNGramSize);
-
+ conf.setInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
+
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(StringTuple.class);
conf.setOutputKeyClass(Text.class);
@@ -311,21 +298,19 @@
}
/**
- * Count the frequencies of words in parallel using Map/Reduce. The input
- * documents have to be in {@link SequenceFile} format
+ * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
+ * {@link SequenceFile} format
*/
private static void startWordCounting(Path input, Path output, int minSupport) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DictionaryVectorizer.class);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set enable serialisation of conf values
- conf.setJobName("DictionaryVectorizer::WordCount: input-folder: "
- + input.toString());
- conf.setInt(MIN_SUPPORT, minSupport);
+ conf.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input.toString());
+ conf.setInt(DictionaryVectorizer.MIN_SUPPORT, minSupport);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
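
A minimal sketch of calling createTermFrequencyVectors as reformatted above; the middle parameters are assumed to follow the javadoc order (input, output, minSupport, maxNGramSize, minLLRValue, numReducers, chunkSizeInMegabytes, sequentialAccess) and the paths are placeholders:

import java.io.IOException;

import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;

public class TfVectorSketch {
  /** Hedged sketch; parameter order and paths are assumptions, not part of this commit. */
  public static void main(String[] args) throws IOException {
    DictionaryVectorizer.createTermFrequencyVectors(
        "/data/tokenized-docs",  // input: SequenceFile of Text doc id -> StringTuple tokens
        "/data/tf-vectors",      // output directory for the term-frequency vectors
        2,                       // minSupport: minimum corpus frequency of a feature
        1,                       // maxNGramSize: 1 = unigrams only
        0.0f,                    // minLLRValue (only used when maxNGramSize > 1)
        1,                       // numReducers
        100,                     // chunkSizeInMegabytes for each dictionary chunk
        false);                  // sequentialAccess output vectors
  }
}
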
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DocumentProcessor.java Sat Feb 13 17:55:56 2010
@@ -36,12 +36,11 @@
import org.apache.mahout.utils.vectors.text.document.SequenceFileTokenizerMapper;
/**
- * This class converts a set of input documents in the sequence file format of
- * {@link StringTuple}s.The {@link SequenceFile} input should have a
- * {@link Text} key containing the unique document identifier and a {@link Text}
- * value containing the whole document. The document should be stored in UTF-8
- * encoding which is recognizable by hadoop. It uses the given {@link Analyzer}
- * to process the document into {@link org.apache.lucene.analysis.Token}s.
+ * This class converts a set of input documents in the sequence file format of {@link StringTuple}s.The
+ * {@link SequenceFile} input should have a {@link Text} key containing the unique document identifier and a
+ * {@link Text} value containing the whole document. The document should be stored in UTF-8 encoding which is
+ * recognizable by hadoop. It uses the given {@link Analyzer} to process the document into
+ * {@link org.apache.lucene.analysis.Token}s.
*
*/
public final class DocumentProcessor {
@@ -59,32 +58,27 @@
}
/**
- * Convert the input documents into token array using the {@link StringTuple}
- * The input documents has to be in the {@link SequenceFile} format
+ * Convert the input documents into token array using the {@link StringTuple} The input documents has to be
+ * in the {@link SequenceFile} format
*
* @param input
* input directory of the documents in {@link SequenceFile} format
* @param output
- * output directory were the {@link StringTuple} token array of each
- * document has to be created
+ * output directory were the {@link StringTuple} token array of each document has to be created
* @param analyzerClass
* The Lucene {@link Analyzer} for tokenizing the UTF-8 text
* @throws IOException
*/
- public static void tokenizeDocuments(String input,
- Class<? extends Analyzer> analyzerClass,
- String output) throws IOException {
+ public static void tokenizeDocuments(String input, Class<? extends Analyzer> analyzerClass, String output) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(DocumentProcessor.class);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set enable serialisation of conf values
- conf.set(ANALYZER_CLASS, analyzerClass.getName());
- conf.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: "
- + input);
+ conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClass.getName());
+ conf.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(StringTuple.class);
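
The reflowed tokenizeDocuments signature above takes (input, analyzerClass, output); a minimal sketch with placeholder paths:

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;

public class TokenizeSketch {
  /** Hedged sketch; paths are hypothetical and not part of this commit. */
  public static void main(String[] args) throws IOException {
    DocumentProcessor.tokenizeDocuments(
        "/data/raw-docs",         // input: SequenceFile of Text doc id -> Text document body
        StandardAnalyzer.class,   // analyzerClass used by SequenceFileTokenizerMapper
        "/data/tokenized-docs");  // output of StringTuple token arrays
  }
}
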
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Sat Feb 13 17:55:56 2010
@@ -36,20 +36,14 @@
/**
* Tokenizes a text document and outputs tokens in a StringTuple
*/
-public class SequenceFileTokenizerMapper extends MapReduceBase implements
- Mapper<Text,Text,Text,StringTuple> {
+public class SequenceFileTokenizerMapper extends MapReduceBase implements Mapper<Text,Text,Text,StringTuple> {
private Analyzer analyzer;
@Override
- public void map(Text key,
- Text value,
- OutputCollector<Text,StringTuple> output,
- Reporter reporter) throws IOException {
- TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(
- value.toString()));
- TermAttribute termAtt = (TermAttribute) stream
- .addAttribute(TermAttribute.class);
+ public void map(Text key, Text value, OutputCollector<Text,StringTuple> output, Reporter reporter) throws IOException {
+ TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
StringTuple document = new StringTuple();
while (stream.incrementToken()) {
if (termAtt.termLength() > 0) {
@@ -64,8 +58,8 @@
super.configure(job);
try {
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
- Class<?> cl = ccl.loadClass(job.get(DocumentProcessor.ANALYZER_CLASS,
- StandardAnalyzer.class.getName()));
+ Class<?> cl = ccl
+ .loadClass(job.get(DocumentProcessor.ANALYZER_CLASS, StandardAnalyzer.class.getName()));
analyzer = (Analyzer) cl.newInstance();
} catch (ClassNotFoundException e) {
throw new IllegalStateException(e);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Sat Feb 13 17:55:56 2010
@@ -40,8 +40,8 @@
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.IteratorTokenStream;
+import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
/**
@@ -52,7 +52,7 @@
private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>();
private final VectorWritable vectorWritable = new VectorWritable();
-
+
private int dimension;
private boolean sequentialAccess;
@@ -63,22 +63,24 @@
Iterator<StringTuple> values,
OutputCollector<Text,VectorWritable> output,
Reporter reporter) throws IOException {
- if (values.hasNext() == false) return;
+ if (values.hasNext() == false) {
+ return;
+ }
StringTuple value = values.next();
- Vector vector = new RandomAccessSparseVector(key.toString(),
- dimension,
- value.length()); // guess at initial size
+ Vector vector = new RandomAccessSparseVector(key.toString(), dimension, value.length()); // guess at
+ // initial size
if (maxNGramSize >= 2) {
- ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
- .getEntries().iterator()), maxNGramSize);
+ ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()),
+ maxNGramSize);
do {
- String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
- .term();
+ String term = ((TermAttribute) sf.getAttribute(TermAttribute.class)).term();
if (term.length() > 0) { // ngram
- if (dictionary.containsKey(term) == false) continue;
+ if (dictionary.containsKey(term) == false) {
+ continue;
+ }
int termId = dictionary.get(term);
vector.setQuick(termId, vector.getQuick(termId) + 1);
}
@@ -89,7 +91,9 @@
} else {
for (String term : value.getEntries()) {
if (term.length() > 0) { // unigram
- if (dictionary.containsKey(term) == false) continue;
+ if (dictionary.containsKey(term) == false) {
+ continue;
+ }
int termId = dictionary.get(term);
vector.setQuick(termId, vector.getQuick(termId) + 1);
}
@@ -112,13 +116,11 @@
maxNGramSize = job.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
URI[] localFiles = DistributedCache.getCacheFiles(job);
if (localFiles == null || localFiles.length < 1) {
- throw new IllegalArgumentException(
- "missing paths from the DistributedCache");
+ throw new IllegalArgumentException("missing paths from the DistributedCache");
}
Path dictionaryFile = new Path(localFiles[0].getPath());
FileSystem fs = dictionaryFile.getFileSystem(job);
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile,
- job);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, job);
Text key = new Text();
IntWritable value = new IntWritable();
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java Sat Feb 13 17:55:56 2010
@@ -30,12 +30,10 @@
import org.apache.mahout.math.map.OpenObjectLongHashMap;
/**
- * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the
- * count of the words
+ * TextVectorizer Term Count Mapper. Tokenizes a text document and outputs the count of the words
*
*/
-public class TermCountMapper extends MapReduceBase implements
- Mapper<Text,StringTuple,Text,LongWritable> {
+public class TermCountMapper extends MapReduceBase implements Mapper<Text,StringTuple,Text,LongWritable> {
@Override
public void map(Text key,
StringTuple value,
@@ -45,7 +43,9 @@
for (String word : value.getEntries()) {
if (wordCount.containsKey(word) == false) {
wordCount.put(word, 1);
- } else wordCount.put(word, wordCount.get(word) + 1);
+ } else {
+ wordCount.put(word, wordCount.get(word) + 1);
+ }
}
wordCount.forEachPair(new ObjectLongProcedure<String>() {
@Override
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountReducer.java Sat Feb 13 17:55:56 2010
@@ -30,11 +30,9 @@
import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
/**
- * Can also be used as a local Combiner. This accumulates all the words and the
- * weights and sums them up.
+ * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
*/
-public class TermCountReducer extends MapReduceBase implements
- Reducer<Text,LongWritable,Text,LongWritable> {
+public class TermCountReducer extends MapReduceBase implements Reducer<Text,LongWritable,Text,LongWritable> {
private static int minSupport;
@@ -44,9 +42,10 @@
OutputCollector<Text,LongWritable> output,
Reporter reporter) throws IOException {
long sum = 0;
- while (values.hasNext())
+ while (values.hasNext()) {
sum += values.next().get();
- if (sum >= minSupport) {
+ }
+ if (sum >= TermCountReducer.minSupport) {
output.collect(key, new LongWritable(sum));
}
}
@@ -54,7 +53,7 @@
@Override
public void configure(JobConf job) {
super.configure(job);
- minSupport = job.getInt(DictionaryVectorizer.MIN_SUPPORT,
+ TermCountReducer.minSupport = job.getInt(DictionaryVectorizer.MIN_SUPPORT,
DictionaryVectorizer.DEFAULT_MIN_SUPPORT);
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountMapper.java Sat Feb 13 17:55:56 2010
@@ -28,8 +28,8 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
/**
* TextVectorizer Document Frequency Count Mapper. Outputs 1 for each feature
@@ -51,8 +51,8 @@
while (it.hasNext()) {
Element e = it.next();
- output.collect(new IntWritable(e.index()), ONE);
+ output.collect(new IntWritable(e.index()), TermDocumentCountMapper.ONE);
}
- output.collect(TOTAL_COUNT, ONE);
+ output.collect(TermDocumentCountMapper.TOTAL_COUNT, TermDocumentCountMapper.ONE);
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermDocumentCountReducer.java Sat Feb 13 17:55:56 2010
@@ -28,8 +28,7 @@
import org.apache.hadoop.mapred.Reporter;
/**
- * Can also be used as a local Combiner. This accumulates all the features and
- * the weights and sums them up.
+ * Can also be used as a local Combiner. This accumulates all the features and the weights and sums them up.
*/
public class TermDocumentCountReducer extends MapReduceBase implements
Reducer<IntWritable,LongWritable,IntWritable,LongWritable> {
@@ -40,8 +39,9 @@
OutputCollector<IntWritable,LongWritable> output,
Reporter reporter) throws IOException {
long sum = 0;
- while (values.hasNext())
+ while (values.hasNext()) {
sum += values.next().get();
+ }
output.collect(key, new LongWritable(sum));
}
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Sat Feb 13 17:55:56 2010
@@ -48,11 +48,10 @@
import org.apache.mahout.utils.vectors.text.term.TermDocumentCountReducer;
/**
- * This class converts a set of input vectors with term frequencies to TfIdf
- * vectors. The Sequence file input should have a {@link WritableComparable} key
- * containing and a {@link VectorWritable} value containing the term frequency
- * vector. This is conversion class uses multiple map/reduces to convert the
- * vectors to TfIdf format
+ * This class converts a set of input vectors with term frequencies to TfIdf vectors. The Sequence file input
+ * should have a {@link WritableComparable} key containing and a {@link VectorWritable} value containing the
+ * term frequency vector. This is conversion class uses multiple map/reduces to convert the vectors to TfIdf
+ * format
*
*/
public final class TFIDFConverter {
@@ -91,31 +90,26 @@
}
/**
- * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the
- * input set of vectors in {@link SequenceFile} format. This job uses a fixed
- * limit on the maximum memory used by the feature chunk per node thereby
- * splitting the process across multiple map/reduces.
+ * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors in
+ * {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the feature chunk
+ * per node thereby splitting the process across multiple map/reduces.
*
* @param input
* input directory of the vectors in {@link SequenceFile} format
* @param output
- * output directory where
- * {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the
- * document are generated
+ * output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
+ * are generated
* @param minDf
* The minimum document frequency. Default 1
* @param maxDFPercent
- * The max percentage of vectors for the DF. Can be used to remove
- * really high frequency features. Expressed as an integer between 0
- * and 100. Default 99
+ * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
+ * Expressed as an integer between 0 and 100. Default 99
* @param chunkSizeInMegabytes
- * the size in MB of the feature => id chunk to be kept in memory at
- * each node during Map/Reduce stage. Its recommended you calculated
- * this based on the number of cores and the free memory available to
- * you per node. Say, you have 2 cores and around 1GB extra memory to
- * spare we recommend you use a split size of around 400-500MB so
- * that two simultaneous reducers can create partial vectors without
- * thrashing the system due to increased swapping
+ * the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
+ * stage. Its recommended you calculated this based on the number of cores and the free memory
+ * available to you per node. Say, you have 2 cores and around 1GB extra memory to spare we
+ * recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
+ * partial vectors without thrashing the system due to increased swapping
* @throws IOException
*/
public static void processTfIdf(String input,
@@ -125,52 +119,48 @@
int maxDFPercent,
float normPower,
boolean sequentialAccessOutput) throws IOException {
- if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
- chunkSizeInMegabytes = MIN_CHUNKSIZE;
- } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
- chunkSizeInMegabytes = MAX_CHUNKSIZE;
+ if (chunkSizeInMegabytes < TFIDFConverter.MIN_CHUNKSIZE) {
+ chunkSizeInMegabytes = TFIDFConverter.MIN_CHUNKSIZE;
+ } else if (chunkSizeInMegabytes > TFIDFConverter.MAX_CHUNKSIZE) { // 10GB
+ chunkSizeInMegabytes = TFIDFConverter.MAX_CHUNKSIZE;
}
if (normPower != PartialVectorMerger.NO_NORMALIZING && normPower < 0) {
throw new IllegalArgumentException("normPower must either be -1 or >= 0");
}
- if (minDf < 1) minDf = 1;
- if (maxDFPercent < 0 || maxDFPercent > 100) maxDFPercent = 99;
+ if (minDf < 1) {
+ minDf = 1;
+ }
+ if (maxDFPercent < 0 || maxDFPercent > 100) {
+ maxDFPercent = 99;
+ }
Path inputPath = new Path(input);
- Path wordCountPath = new Path(output + WORDCOUNT_OUTPUT_FOLDER);
+ Path wordCountPath = new Path(output + TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
- startDFCounting(inputPath, wordCountPath);
- Pair<Long[],List<Path>> datasetFeatures = createDictionaryChunks(
- wordCountPath, output, chunkSizeInMegabytes);
+ TFIDFConverter.startDFCounting(inputPath, wordCountPath);
+ Pair<Long[],List<Path>> datasetFeatures = TFIDFConverter.createDictionaryChunks(wordCountPath, output,
+ chunkSizeInMegabytes);
int partialVectorIndex = 0;
List<Path> partialVectorPaths = new ArrayList<Path>();
List<Path> dictionaryChunks = datasetFeatures.getSecond();
for (Path dictionaryChunk : dictionaryChunks) {
- Path partialVectorOutputPath = getPath(output + VECTOR_OUTPUT_FOLDER,
+ Path partialVectorOutputPath = TFIDFConverter.getPath(output + TFIDFConverter.VECTOR_OUTPUT_FOLDER,
partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
- makePartialVectors(input,
- datasetFeatures.getFirst()[0],
- datasetFeatures.getFirst()[1],
- minDf,
- maxDFPercent,
- dictionaryChunk,
- partialVectorOutputPath);
+ TFIDFConverter.makePartialVectors(input, datasetFeatures.getFirst()[0], datasetFeatures.getFirst()[1],
+ minDf, maxDFPercent, dictionaryChunk, partialVectorOutputPath);
}
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(partialVectorPaths.get(0).toUri(), conf);
- String outputDir = output + DOCUMENT_VECTOR_OUTPUT_FOLDER;
+ String outputDir = output + TFIDFConverter.DOCUMENT_VECTOR_OUTPUT_FOLDER;
if (dictionaryChunks.size() > 1) {
- PartialVectorMerger.mergePartialVectors(partialVectorPaths,
- outputDir,
- normPower,
- (int)(long)datasetFeatures.getFirst()[0],
- sequentialAccessOutput);
+ PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, normPower,
+ (int) (long) datasetFeatures.getFirst()[0], sequentialAccessOutput);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -180,9 +170,8 @@
}
/**
- * Read the document frequency List which is built at the end of the DF Count
- * Job. This will use constant memory and will run at the speed of your disk
- * read
+ * Read the document frequency List which is built at the end of the DF Count Job. This will use constant
+ * memory and will run at the speed of your disk read
*
* @param featureCountPath
* @param dictionaryPathBase
@@ -198,16 +187,15 @@
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
- FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath
- .toString()
- + OUTPUT_FILES_PATTERN));
+ FileStatus[] outputFiles = fs.globStatus(new Path(featureCountPath.toString()
+ + TFIDFConverter.OUTPUT_FILES_PATTERN));
long chunkSizeLimit = chunkSizeInMegabytes * 1024 * 1024;
int chunkIndex = 0;
- Path chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
+ Path chunkPath = TFIDFConverter.getPath(dictionaryPathBase + TFIDFConverter.FREQUENCY_FILE, chunkIndex);
chunkPaths.add(chunkPath);
- SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf,
- chunkPath, IntWritable.class, LongWritable.class);
+ SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
+ LongWritable.class);
long currentChunkSize = 0;
long featureCount = 0;
@@ -221,16 +209,14 @@
freqWriter.close();
chunkIndex++;
- chunkPath = getPath(dictionaryPathBase + FREQUENCY_FILE, chunkIndex);
+ chunkPath = TFIDFConverter.getPath(dictionaryPathBase + TFIDFConverter.FREQUENCY_FILE, chunkIndex);
chunkPaths.add(chunkPath);
- freqWriter = new SequenceFile.Writer(fs, conf, chunkPath,
- IntWritable.class, LongWritable.class);
+ freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
currentChunkSize = 0;
}
- int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + (Integer.SIZE / 8)
- + (Long.SIZE / 8);
+ int fieldSize = TFIDFConverter.SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
currentChunkSize += fieldSize;
if (key.get() >= 0) {
freqWriter.append(key, value);
@@ -251,8 +237,8 @@
}
/**
- * Create a partial tfidf vector using a chunk of features from the input
- * vectors. The input vectors has to be in the {@link SequenceFile} format
+ * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors has to
+ * be in the {@link SequenceFile} format
*
* @param input
* input directory of the vectors in {@link SequenceFile} format
@@ -263,9 +249,8 @@
* @param minDf
* The minimum document frequency. Default 1
* @param maxDFPercent
- * The max percentage of vectors for the DF. Can be used to remove
- * really high frequency features. Expressed as an integer between 0
- * and 100. Default 99
+ * The max percentage of vectors for the DF. Can be used to remove really high frequency features.
+ * Expressed as an integer between 0 and 100. Default 99
* @param dictionaryFilePath
* location of the chunk of features and the id's
* @param output
@@ -282,22 +267,19 @@
Configurable client = new JobClient();
JobConf conf = new JobConf(TFIDFConverter.class);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set enable serialisation of conf values
- conf.setJobName("TFIDFConverter:: MakePartialVectors: input-folder: "
- + input + ", dictionary-file: "
+ conf.setJobName("TFIDFConverter:: MakePartialVectors: input-folder: " + input + ", dictionary-file: "
+ dictionaryFilePath.toString());
- conf.setLong(FEATURE_COUNT, featureCount.longValue());
- conf.setLong(VECTOR_COUNT, vectorCount.longValue());
- conf.setInt(MIN_DF, minDf);
- conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
+ conf.setLong(TFIDFConverter.FEATURE_COUNT, featureCount.longValue());
+ conf.setLong(TFIDFConverter.VECTOR_COUNT, vectorCount.longValue());
+ conf.setInt(TFIDFConverter.MIN_DF, minDf);
+ conf.setInt(TFIDFConverter.MAX_DF_PERCENTAGE, maxDFPercent);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(VectorWritable.class);
- DistributedCache
- .setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
+ DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);
FileInputFormat.setInputPaths(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, output);
@@ -316,20 +298,18 @@
}
/**
- * Count the document frequencies of features in parallel using Map/Reduce.
- * The input documents have to be in {@link SequenceFile} format
+ * Count the document frequencies of features in parallel using Map/Reduce. The input documents have to be
+ * in {@link SequenceFile} format
*/
private static void startDFCounting(Path input, Path output) throws IOException {
Configurable client = new JobClient();
JobConf conf = new JobConf(TFIDFConverter.class);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,"
- + "org.apache.hadoop.io.serializer.WritableSerialization");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
// this conf parameter needs to be set to enable serialisation of conf values
- conf.setJobName("VectorTfIdf Document Frequency Count running over input: "
- + input.toString());
+ conf.setJobName("VectorTfIdf Document Frequency Count running over input: " + input.toString());
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(LongWritable.class);
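
The DF-counting job configured here pairs with a summing reducer. A sketch of what such a reducer could look like with the old mapred API follows; it is an illustration under the assumption that each mapper emits a 1 per document containing a feature, not the reducer shipped in this commit.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class DFCountReducerSketch extends MapReduceBase
    implements Reducer<IntWritable,LongWritable,IntWritable,LongWritable> {

  @Override
  public void reduce(IntWritable featureIndex, Iterator<LongWritable> counts,
      OutputCollector<IntWritable,LongWritable> output, Reporter reporter) throws IOException {
    long df = 0;
    while (counts.hasNext()) {
      df += counts.next().get();  // sum the per-document indicators for this feature
    }
    output.collect(featureIndex, new LongWritable(df));
  }
}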
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java Sat Feb 13 17:55:56 2010
@@ -35,16 +35,15 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenIntLongHashMap;
import org.apache.mahout.utils.vectors.TFIDF;
/**
* Converts a document into a sparse vector
*/
-public class TFIDFPartialVectorReducer extends MapReduceBase
- implements
+public class TFIDFPartialVectorReducer extends MapReduceBase implements
Reducer<WritableComparable<?>,VectorWritable,WritableComparable<?>,VectorWritable> {
private final OpenIntLongHashMap dictionary = new OpenIntLongHashMap();
@@ -60,19 +59,27 @@
Iterator<VectorWritable> values,
OutputCollector<WritableComparable<?>,VectorWritable> output,
Reporter reporter) throws IOException {
- if (!values.hasNext()) return;
+ if (!values.hasNext()) {
+ return;
+ }
Vector value = values.next().get();
Iterator<Element> it = value.iterateNonZero();
- Vector vector = new RandomAccessSparseVector(key
- .toString(), (int)featureCount, value.getNumNondefaultElements());
+ Vector vector = new RandomAccessSparseVector(key.toString(), (int) featureCount, value
+ .getNumNondefaultElements());
while (it.hasNext()) {
Element e = it.next();
- if (!dictionary.containsKey(e.index())) continue;
+ if (!dictionary.containsKey(e.index())) {
+ continue;
+ }
long df = dictionary.get(e.index());
- if (df / vectorCount > maxDfPercent) continue;
- if (df < minDf) df = minDf;
- vector.setQuick(e.index(), tfidf.calculate((int) e.get(), (int) df,
- (int) featureCount, (int) vectorCount));
+ if (df / vectorCount > maxDfPercent) {
+ continue;
+ }
+ if (df < minDf) {
+ df = minDf;
+ }
+ vector.setQuick(e.index(), tfidf.calculate((int) e.get(), (int) df, (int) featureCount,
+ (int) vectorCount));
}
vectorWritable.set(vector);
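
The loop above filters each element by document frequency and then weights it. A standalone sketch of that per-element decision is below; it uses the textbook tf * log(N/df) weight and the 0-100 percentage form described in the javadoc, both of which are assumptions, whereas the committed code compares df / vectorCount directly and delegates the weight to the TFIDF helper.

public final class TfIdfWeightSketch {

  private TfIdfWeightSketch() {}

  /** Returns the weight for one feature, or Double.NaN if the feature should be dropped. */
  public static double weight(long tf, long df, long vectorCount, int minDf, int maxDfPercent) {
    if ((100.0 * df) / vectorCount > maxDfPercent) {
      return Double.NaN;                     // feature occurs in too many documents
    }
    long effectiveDf = Math.max(df, minDf);  // clamp rare features up to the minimum DF
    return tf * Math.log((double) vectorCount / effectiveDf);
  }
}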
@@ -86,8 +93,7 @@
URI[] localFiles = DistributedCache.getCacheFiles(job);
if (localFiles == null || localFiles.length < 1) {
- throw new IllegalArgumentException(
- "missing paths from the DistributedCache");
+ throw new IllegalArgumentException("missing paths from the DistributedCache");
}
vectorCount = job.getLong(TFIDFConverter.VECTOR_COUNT, 1);
@@ -97,8 +103,7 @@
Path dictionaryFile = new Path(localFiles[0].getPath());
FileSystem fs = dictionaryFile.getFileSystem(job);
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile,
- job);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, job);
IntWritable key = new IntWritable();
LongWritable value = new LongWritable();
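
The configure() hunk stops just before the read loop. A self-contained sketch of loading such a dictionary chunk into an OpenIntLongHashMap, under the assumption that each record maps a feature index to its document frequency:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.map.OpenIntLongHashMap;

public final class DictionaryChunkLoaderSketch {

  private DictionaryChunkLoaderSketch() {}

  /** Loads (feature index, document frequency) pairs from a SequenceFile into an in-memory map. */
  public static OpenIntLongHashMap load(Path dictionaryFile, Configuration conf) throws IOException {
    OpenIntLongHashMap dictionary = new OpenIntLongHashMap();
    FileSystem fs = dictionaryFile.getFileSystem(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
    try {
      IntWritable key = new IntWritable();
      LongWritable value = new LongWritable();
      while (reader.next(key, value)) {
        dictionary.put(key.get(), value.get());  // feature index -> document frequency
      }
    } finally {
      reader.close();
    }
    return dictionary;
  }
}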
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/dirichlet/TestL1ModelClustering.java Sat Feb 13 17:55:56 2010
@@ -6,13 +6,13 @@
import java.util.Iterator;
import java.util.List;
+import junit.framework.Assert;
+
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.mahout.clustering.dirichlet.models.L1ModelDistribution;
@@ -33,66 +33,67 @@
public class TestL1ModelClustering extends MahoutTestCase {
-
- @SuppressWarnings("unchecked")
+
private class MapElement implements Comparable<MapElement> {
-
+
MapElement(double pdf, String doc) {
super();
this.pdf = pdf;
this.doc = doc;
}
-
+
private final Double pdf;
-
+
private final String doc;
-
+
@Override
// reverse compare to sort in reverse order
public int compareTo(MapElement e) {
- if (e.pdf > pdf)
+ if (e.pdf > pdf) {
return 1;
- else if (e.pdf < pdf)
+ } else if (e.pdf < pdf) {
return -1;
- else
+ } else {
return 0;
+ }
}
-
+
+ @Override
public String toString() {
return pdf.toString() + ' ' + doc.toString();
}
-
+
}
-
+
private static final String[] DOCS = { "The quick red fox jumped over the lazy brown dogs.",
- "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
- "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
- "Moby Dick is a story of a whale and a man obsessed.", "The robber wore a black fleece jacket and a baseball cap.",
- "The English Springer Spaniel is the best of all dogs." };
-
+ "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
+ "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
+ "Moby Dick is a story of a whale and a man obsessed.", "The robber wore a black fleece jacket and a baseball cap.",
+ "The English Springer Spaniel is the best of all dogs." };
+
private List<VectorWritable> sampleData;
-
+
private static final String[] DOCS2 = { "The quick red fox jumped over the lazy brown dogs.",
- "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
- "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
- "Mary had a little goat whose fleece was white as snow.", "Mary had a little lamb whose fleece was black as tar.",
- "Dick had a little goat whose fleece was white as snow.", "Moby Dick is a story of a whale and a man obsessed.",
- "Moby Bob is a story of a walrus and a man obsessed.", "Moby Dick is a story of a whale and a crazy man.",
- "The robber wore a black fleece jacket and a baseball cap.", "The robber wore a red fleece jacket and a baseball cap.",
- "The robber wore a white fleece jacket and a baseball cap.", "The English Springer Spaniel is the best of all dogs." };
-
+ "The quick brown fox jumped over the lazy red dogs.", "The quick red cat jumped over the lazy brown dogs.",
+ "The quick brown cat jumped over the lazy red dogs.", "Mary had a little lamb whose fleece was white as snow.",
+ "Mary had a little goat whose fleece was white as snow.", "Mary had a little lamb whose fleece was black as tar.",
+ "Dick had a little goat whose fleece was white as snow.", "Moby Dick is a story of a whale and a man obsessed.",
+ "Moby Bob is a story of a walrus and a man obsessed.", "Moby Dick is a story of a whale and a crazy man.",
+ "The robber wore a black fleece jacket and a baseball cap.", "The robber wore a red fleece jacket and a baseball cap.",
+ "The robber wore a white fleece jacket and a baseball cap.", "The English Springer Spaniel is the best of all dogs." };
+
@Override
@Before
public void setUp() throws Exception {
super.setUp();
RandomUtils.useTestSeed();
}
-
+
private void getSampleData(String[] docs2) throws IOException {
sampleData = new ArrayList<VectorWritable>();
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true,
- IndexWriter.MaxFieldLength.UNLIMITED);
+ IndexWriter.MaxFieldLength.UNLIMITED);
for (int i = 0; i < docs2.length; i++) {
Document doc = new Document();
Field id = new Field("id", "doc_" + i, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
@@ -108,15 +109,15 @@
TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
LuceneIterable iterable = new LuceneIterable(reader, "id", "content", mapper);
-
+
int i = 0;
for (Vector vector : iterable) {
- assertNotNull(vector);
- System.out.println("Vector[" + i++ + "]=" + formatVector(vector));
+ Assert.assertNotNull(vector);
+ System.out.println("Vector[" + i++ + "]=" + TestL1ModelClustering.formatVector(vector));
sampleData.add(new VectorWritable(vector));
}
}
-
+
private static String formatVector(Vector v) {
StringBuilder buf = new StringBuilder();
int nzero = 0;
@@ -131,17 +132,19 @@
int nextIx = 0;
for (int i = 0; i < v.size(); i++) {
double elem = v.get(i);
- if (elem == 0.0)
+ if (elem == 0.0) {
continue;
- if (i > nextIx)
+ }
+ if (i > nextIx) {
buf.append("..{").append(i).append("}=");
+ }
buf.append(String.format("%.2f", elem)).append(", ");
nextIx = i + 1;
}
buf.append(']');
return buf.toString();
}
-
+
private static void printSamples(List<Model<VectorWritable>[]> result, int significant) {
int row = 0;
for (Model<VectorWritable>[] r : result) {
@@ -161,13 +164,14 @@
}
System.out.println();
}
-
+
private void printClusters(Model<VectorWritable>[] models, List<VectorWritable> samples, String[] docs) {
for (int m = 0; m < models.length; m++) {
Model<VectorWritable> model = models[m];
int count = model.count();
- if (count == 0)
+ if (count == 0) {
continue;
+ }
System.out.println("Model[" + m + "] had " + count + " hits (!) and " + (samples.size()-count) + " misses (? in pdf order) during the last iteration:");
MapElement[] map = new MapElement[samples.size()];
// sort the samples by pdf
@@ -178,35 +182,36 @@
Arrays.sort(map);
// now find the n=model.count() most likely docs and output them
for (int i = 0; i < map.length; i++) {
- if (i < count)
+ if (i < count) {
System.out.print("! ");
- else
+ } else {
System.out.print("? ");
+ }
System.out.println(map[i].doc);
}
}
}
-
+
public void testDocs() throws Exception {
System.out.println("testDocs");
- getSampleData(DOCS);
+ getSampleData(TestL1ModelClustering.DOCS);
DirichletClusterer<VectorWritable> dc = new DirichletClusterer<VectorWritable>(sampleData, new L1ModelDistribution(sampleData
- .get(0)), 1.0, 15, 1, 0);
+ .get(0)), 1.0, 15, 1, 0);
List<Model<VectorWritable>[]> result = dc.cluster(10);
- assertNotNull(result);
- printSamples(result, 0);
- printClusters(result.get(result.size() - 1), sampleData, DOCS);
+ Assert.assertNotNull(result);
+ TestL1ModelClustering.printSamples(result, 0);
+ printClusters(result.get(result.size() - 1), sampleData, TestL1ModelClustering.DOCS);
}
-
+
public void testDocs2() throws Exception {
System.out.println("testDocs2");
- getSampleData(DOCS2);
+ getSampleData(TestL1ModelClustering.DOCS2);
DirichletClusterer<VectorWritable> dc = new DirichletClusterer<VectorWritable>(sampleData, new L1ModelDistribution(sampleData
- .get(0)), 1.0, 15, 1, 0);
+ .get(0)), 1.0, 15, 1, 0);
List<Model<VectorWritable>[]> result = dc.cluster(10);
- assertNotNull(result);
- printSamples(result, 0);
- printClusters(result.get(result.size() - 1), sampleData, DOCS2);
+ Assert.assertNotNull(result);
+ TestL1ModelClustering.printSamples(result, 0);
+ printClusters(result.get(result.size() - 1), sampleData, TestL1ModelClustering.DOCS2);
}
-
+
}
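
For context, the clustering call both tests exercise, reduced to a standalone snippet with toy vectors in place of the Lucene-derived TF-IDF vectors. The DirichletClusterer package and the meaning of the numeric arguments are assumptions inferred from the test code.

import java.util.ArrayList;
import java.util.List;

import org.apache.mahout.clustering.dirichlet.DirichletClusterer;  // package assumed from the test's location
import org.apache.mahout.clustering.dirichlet.models.L1ModelDistribution;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.VectorWritable;

public final class DirichletUsageSketch {

  private DirichletUsageSketch() {}

  public static void main(String[] args) {
    // toy stand-in for the TF-IDF vectors the tests build from a Lucene RAMDirectory
    List<VectorWritable> sampleData = new ArrayList<VectorWritable>();
    sampleData.add(new VectorWritable(new DenseVector(new double[] {1.0, 0.0, 2.0})));
    sampleData.add(new VectorWritable(new DenseVector(new double[] {0.0, 1.0, 0.0})));

    // arguments copied from the tests: alpha0 = 1.0, 15 candidate models, thin = 1, burn-in = 0
    DirichletClusterer<VectorWritable> dc = new DirichletClusterer<VectorWritable>(
        sampleData, new L1ModelDistribution(sampleData.get(0)), 1.0, 15, 1, 0);
    List<?> result = dc.cluster(10);  // one Model[] snapshot per iteration
    System.out.println("iterations recorded: " + result.size());
  }
}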
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java Sat Feb 13 17:55:56 2010
@@ -61,29 +61,31 @@
key.set("dummy-key");
String[] input = {"the", "best", "of", "times", "the", "worst", "of",
- "times"};
+ "times"};
StringTuple inputTuple = new StringTuple();
for (String i : input) {
inputTuple.add(i);
}
String[][] values = new String[][] { {"h_the", "the best"},
- {"t_best", "the best"},
- {"h_of", "of times"},
- {"t_times", "of times"},
- {"h_best", "best of"},
- {"t_of", "best of"},
- {"h_the", "the worst"},
- {"t_worst", "the worst"},
- {"h_times", "times the"},
- {"t_the", "times the"},
- {"h_worst", "worst of"},
- {"t_of", "worst of"},};
+ {"t_best", "the best"},
+ {"h_of", "of times"},
+ {"t_times", "of times"},
+ {"h_best", "best of"},
+ {"t_of", "best of"},
+ {"h_the", "the worst"},
+ {"t_worst", "the worst"},
+ {"h_times", "times the"},
+ {"t_the", "times the"},
+ {"h_worst", "worst of"},
+ {"t_of", "worst of"},};
// set up expectations for mocks. ngram max size = 2
for (String[] v : values) {
Type p = v[0].startsWith("h") ? HEAD : TAIL;
int frequency = 1;
- if (v[1].equals("of times")) frequency = 2;
+ if (v[1].equals("of times")) {
+ frequency = 2;
+ }
Gram subgram = new Gram(v[0].substring(2), frequency, p);
Gram ngram = new Gram(v[1], frequency);
collector.collect(subgram, ngram);
@@ -110,34 +112,36 @@
key.set("dummy-key");
String[] input = {"the", "best", "of", "times", "the", "worst", "of",
- "times"};
+ "times"};
StringTuple inputTuple = new StringTuple();
for (String i : input) {
inputTuple.add(i);
}
String[][] values = new String[][] { {"h_the", "the best"},
- {"t_best", "the best"},
- {"h_of", "of times"},
- {"t_times", "of times"},
- {"h_best", "best of"},
- {"t_of", "best of"},
- {"h_the", "the worst"},
- {"t_worst", "the worst"},
- {"h_times", "times the"},
- {"t_the", "times the"},
- {"h_worst", "worst of"},
- {"t_of", "worst of"},
- {"u_worst", "worst"}, {"u_of", "of"},
- {"u_the", "the"}, {"u_best", "best"},
- {"u_times", "times"},};
+ {"t_best", "the best"},
+ {"h_of", "of times"},
+ {"t_times", "of times"},
+ {"h_best", "best of"},
+ {"t_of", "best of"},
+ {"h_the", "the worst"},
+ {"t_worst", "the worst"},
+ {"h_times", "times the"},
+ {"t_the", "times the"},
+ {"h_worst", "worst of"},
+ {"t_of", "worst of"},
+ {"u_worst", "worst"}, {"u_of", "of"},
+ {"u_the", "the"}, {"u_best", "best"},
+ {"u_times", "times"},};
// set up expectations for mocks. ngram max size = 2
for (String[] v : values) {
Type p = v[0].startsWith("h") ? HEAD : TAIL;
p = v[0].startsWith("u") ? UNIGRAM : p;
int frequency = 1;
if (v[1].equals("of times") || v[1].equals("of") || v[1].equals("times")
- || v[1].equals("the")) frequency = 2;
+ || v[1].equals("the")) {
+ frequency = 2;
+ }
Gram subgram = new Gram(v[0].substring(2), frequency, p);
Gram ngram = new Gram(v[1], frequency);
collector.collect(subgram, ngram);
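
The expectation tables above can be regenerated from the token stream: every bigram contributes a HEAD subgram for its first token and a TAIL subgram for its second. A plain-Java sketch, with prefixed strings standing in for Mahout's Gram type:

import java.util.ArrayList;
import java.util.List;

public final class BigramPairsSketch {

  private BigramPairsSketch() {}

  /** Produces (subgram, ngram) string pairs for every adjacent token pair. */
  public static List<String[]> pairs(String[] tokens) {
    List<String[]> out = new ArrayList<String[]>();
    for (int i = 0; i + 1 < tokens.length; i++) {
      String ngram = tokens[i] + ' ' + tokens[i + 1];
      out.add(new String[] {"h_" + tokens[i], ngram});      // head subgram of the bigram
      out.add(new String[] {"t_" + tokens[i + 1], ngram});  // tail subgram of the bigram
    }
    return out;
  }

  public static void main(String[] args) {
    String[] input = {"the", "best", "of", "times", "the", "worst", "of", "times"};
    for (String[] p : pairs(input)) {
      System.out.println(p[0] + " -> " + p[1]);
    }
  }
}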
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java?rev=909861&r1=909860&r2=909861&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java Sat Feb 13 17:55:56 2010
@@ -44,28 +44,28 @@
output = EasyMock.createMock(OutputCollector.class);
reporter = EasyMock.createMock(Reporter.class);
}
-
+
@Test
public void testReduce() throws Exception {
// test input, input[*][0] is the key,
// input[*][1..n] are the values passed in via
// the iterator.
Gram[][] input = new Gram[][] {
- { new Gram("the", UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM) },
- { new Gram("the", HEAD), new Gram("the best"), new Gram("the worst") },
- { new Gram("of", HEAD), new Gram("of times"), new Gram("of times") },
- { new Gram("times", TAIL), new Gram("of times"), new Gram("of times") }
+ { new Gram("the", UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM) },
+ { new Gram("the", HEAD), new Gram("the best"), new Gram("the worst") },
+ { new Gram("of", HEAD), new Gram("of times"), new Gram("of times") },
+ { new Gram("times", TAIL), new Gram("of times"), new Gram("of times") }
};
-
+
// expected results.
Gram[][] values = new Gram[][] {
- { new Gram("the", 4, UNIGRAM), new Gram("the", 2, UNIGRAM) },
- { new Gram("the best", 1), new Gram("the", 2, HEAD) },
- { new Gram("the worst", 1), new Gram("the", 2, HEAD) },
- { new Gram("of times", 2), new Gram("of", 2, HEAD) },
- { new Gram("of times", 2), new Gram("times", 2, TAIL) }
+ { new Gram("the", 4, UNIGRAM), new Gram("the", 2, UNIGRAM) },
+ { new Gram("the best", 1), new Gram("the", 2, HEAD) },
+ { new Gram("the worst", 1), new Gram("the", 2, HEAD) },
+ { new Gram("of times", 2), new Gram("of", 2, HEAD) },
+ { new Gram("of times", 2), new Gram("times", 2, TAIL) }
};
-
+
// set up expectations
for (Gram[] v : values) {
output.collect(v[0], v[1]);