You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2010/02/11 08:12:50 UTC
svn commit: r908859 - in /lucene/mahout/trunk/utils/src: main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/nlp/collocations/llr/ test/java/org/apache/mahout/utils/nlp/collocations/llr/
Author: robinanil
Date: Thu Feb 11 07:12:48 2010
New Revision: 908859
URL: http://svn.apache.org/viewvc?rev=908859&view=rev
Log:
MAHOUT-285 CollocMapper optimisations (Now reduces number of subgrams in output)
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=908859&r1=908858&r2=908859&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Thu Feb 11 07:12:48 2010
@@ -150,118 +150,119 @@
.withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
.create();
-
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = null;
-
try {
- // standard help opt won't work because
- // outputDir is required and exception will
- // be thrown if it is not present.
- cmdLine = parser.parse(args);
- }
- catch (OptionException oe) {
- System.out.println(oe.getMessage());
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
- }
-
- String inputDir = (String) cmdLine.getValue(inputDirOpt);
- String outputDir = (String) cmdLine.getValue(outputDirOpt);
-
- int chunkSize = 100;
- if (cmdLine.hasOption(chunkSizeOpt)) {
- chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
- }
- int minSupport = 2;
- if (cmdLine.hasOption(minSupportOpt)) {
- String minSupportString = (String) cmdLine.getValue(minSupportOpt);
- minSupport = Integer.parseInt(minSupportString);
- }
-
- int maxNGramSize = 1;
-
- if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
- try {
- maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+ }
+
+ String inputDir = (String) cmdLine.getValue(inputDirOpt);
+ String outputDir = (String) cmdLine.getValue(outputDirOpt);
+
+ int chunkSize = 100;
+ if (cmdLine.hasOption(chunkSizeOpt)) {
+ chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
+ }
+ int minSupport = 2;
+ if (cmdLine.hasOption(minSupportOpt)) {
+ String minSupportString = (String) cmdLine.getValue(minSupportOpt);
+ minSupport = Integer.parseInt(minSupportString);
+ }
+
+ int maxNGramSize = 1;
+
+ if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
+ try {
+ maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+ .toString());
+ } catch (NumberFormatException ex) {
+ log.warn("Could not parse ngram size option");
+ }
+ }
+ log.info("Maximum n-gram size is: {}", maxNGramSize);
+
+ if (cmdLine.hasOption(overwriteOutput) == true) {
+ HadoopUtil.overwriteOutput(outputDir);
+ }
+
+ float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+ if (cmdLine.hasOption(minLLROpt)) {
+ minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
+ }
+ log.info("Minimum LLR value: {}", minLLRValue);
+
+ int reduceTasks = 1;
+ if (cmdLine.hasOption(numReduceTasksOpt)) {
+ reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
.toString());
- } catch (NumberFormatException ex) {
- log.warn("Could not parse ngram size option");
}
- }
- log.info("Maximum n-gram size is: {}", maxNGramSize);
-
- if (cmdLine.hasOption(overwriteOutput) == true) {
- HadoopUtil.overwriteOutput(outputDir);
- }
-
- float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
- if (cmdLine.hasOption(minLLROpt)) {
- minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
- }
- log.info("Minimum LLR value: {}", minLLRValue);
-
- int reduceTasks = 1;
- if (cmdLine.hasOption(numReduceTasksOpt)) {
- reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
- .toString());
- }
- log.info("Pass1 reduce tasks: {}", reduceTasks);
-
- Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
- if (cmdLine.hasOption(analyzerNameOpt)) {
- String className = cmdLine.getValue(analyzerNameOpt).toString();
- analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
- // try instantiating it, b/c there isn't any point in setting it if
- // you can't instantiate it
- analyzerClass.newInstance();
- }
-
- boolean processIdf;
-
- if (cmdLine.hasOption(weightOpt)) {
- String wString = cmdLine.getValue(weightOpt).toString();
- if (wString.equalsIgnoreCase("tf")) {
- processIdf = false;
- } else if (wString.equalsIgnoreCase("tfidf")) {
- processIdf = true;
- } else {
- throw new OptionException(weightOpt);
+ log.info("Pass1 reduce tasks: {}", reduceTasks);
+
+ Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+ if (cmdLine.hasOption(analyzerNameOpt)) {
+ String className = cmdLine.getValue(analyzerNameOpt).toString();
+ analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
+ // try instantiating it, b/c there isn't any point in setting it if
+ // you can't instantiate it
+ analyzerClass.newInstance();
}
- } else {
- processIdf = true;
- }
-
- int minDf = 1;
- if (cmdLine.hasOption(minDFOpt)) {
- minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
- }
- int maxDFPercent = 99;
- if (cmdLine.hasOption(maxDFPercentOpt)) {
- maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt)
- .toString());
- }
-
- float norm = PartialVectorMerger.NO_NORMALIZING;
- if (cmdLine.hasOption(powerOpt)) {
- String power = cmdLine.getValue(powerOpt).toString();
- if (power.equals("INF")) {
- norm = Float.POSITIVE_INFINITY;
+
+ boolean processIdf;
+
+ if (cmdLine.hasOption(weightOpt)) {
+ String wString = cmdLine.getValue(weightOpt).toString();
+ if (wString.equalsIgnoreCase("tf")) {
+ processIdf = false;
+ } else if (wString.equalsIgnoreCase("tfidf")) {
+ processIdf = true;
+ } else {
+ throw new OptionException(weightOpt);
+ }
} else {
- norm = Float.parseFloat(power);
+ processIdf = true;
}
- }
- HadoopUtil.overwriteOutput(outputDir);
- String tokenizedPath = outputDir + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
- DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
-
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
- minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
- if (processIdf) {
- TFIDFConverter.processTfIdf(
- outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
- outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf, maxDFPercent, norm);
+
+ int minDf = 1;
+ if (cmdLine.hasOption(minDFOpt)) {
+ minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
+ }
+ int maxDFPercent = 99;
+ if (cmdLine.hasOption(maxDFPercentOpt)) {
+ maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt)
+ .toString());
+ }
+
+ float norm = PartialVectorMerger.NO_NORMALIZING;
+ if (cmdLine.hasOption(powerOpt)) {
+ String power = cmdLine.getValue(powerOpt).toString();
+ if (power.equals("INF")) {
+ norm = Float.POSITIVE_INFINITY;
+ } else {
+ norm = Float.parseFloat(power);
+ }
+ }
+ HadoopUtil.overwriteOutput(outputDir);
+ String tokenizedPath = outputDir
+ + DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER;
+ DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
+ tokenizedPath);
+
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
+ minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
+ if (processIdf) {
+ TFIDFConverter.processTfIdf(
+ outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
+ outputDir + TFIDFConverter.TFIDF_OUTPUT_FOLDER, chunkSize, minDf,
+ maxDFPercent, norm);
+ }
+ } catch (OptionException e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
}
}
+
}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java?rev=908859&r1=908858&r2=908859&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapper.java Thu Feb 11 07:12:48 2010
@@ -35,6 +35,8 @@
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.math.function.ObjectIntProcedure;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -115,12 +117,16 @@
@Override
public void map(Text key,
StringTuple value,
- OutputCollector<Gram,Gram> collector,
+ final OutputCollector<Gram,Gram> collector,
Reporter reporter) throws IOException {
ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value
.getEntries().iterator()), maxShingleSize);
int count = 0; // ngram count
+ OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>(
+ value.getEntries().size() * (maxShingleSize - 1));
+ OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(
+ value.getEntries().size());
do {
String term = ((TermAttribute) sf.getAttribute(TermAttribute.class))
@@ -129,19 +135,54 @@
.type();
if ("shingle".equals(type)) {
count++;
- Gram ngram = new Gram(term);
+ if (ngrams.containsKey(term) == false) {
+ ngrams.put(term, 1);
+ } else {
+ ngrams.put(term, 1 + ngrams.get(term));
+ }
+ } else if (emitUnigrams && term.length() > 0) { // unigram
+ if (unigrams.containsKey(term) == false) {
+ unigrams.put(term, 1);
+ } else {
+ unigrams.put(term, 1 + unigrams.get(term));
+ }
+ }
+ } while (sf.incrementToken());
+
+ ngrams.forEachPair(new ObjectIntProcedure<String>() {
+
+ @Override
+ public boolean apply(String term, int frequency) {
+ Gram ngram = new Gram(term, frequency);
// obtain components, the leading (n-1)gram and the trailing unigram.
int i = term.lastIndexOf(' ');
if (i != -1) { // bigram, trigram etc
- collector.collect(new Gram(term.substring(0, i), HEAD), ngram);
- collector.collect(new Gram(term.substring(i + 1), TAIL), ngram);
+ try {
+ collector.collect(new Gram(term.substring(0, i), frequency, HEAD),
+ ngram);
+ collector.collect(new Gram(term.substring(i + 1), frequency, TAIL),
+ ngram);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
}
- } else if (emitUnigrams && term.length() > 0) { // unigram
- Gram ngram = new Gram(term);
- Gram unigram = new Gram(term, UNIGRAM);
- collector.collect(unigram, ngram);
+ return true;
}
- } while (sf.incrementToken());
+ });
+
+ unigrams.forEachPair(new ObjectIntProcedure<String>() {
+ @Override
+ public boolean apply(String term, int frequency) {
+ try {
+ Gram ngram = new Gram(term, frequency);
+ Gram unigram = new Gram(term, frequency, UNIGRAM);
+ collector.collect(unigram, ngram);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return true;
+ }
+ });
reporter.incrCounter(Count.NGRAM_TOTAL, count);
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java?rev=908859&r1=908858&r2=908859&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java Thu Feb 11 07:12:48 2010
@@ -38,138 +38,126 @@
import org.junit.Before;
import org.junit.Test;
-/** Test for CollocMapper
- * FIXME: Add negative test cases
+/**
+ * Test for CollocMapper FIXME: Add negative test cases
*/
@SuppressWarnings("deprecation")
public class CollocMapperTest {
-
+
OutputCollector<Gram,Gram> collector;
Reporter reporter;
-
+
@Before
@SuppressWarnings("unchecked")
public void setUp() {
collector = EasyMock.createMock(OutputCollector.class);
- reporter = EasyMock.createMock(Reporter.class);
+ reporter = EasyMock.createMock(Reporter.class);
}
-
+
@Test
public void testCollectNgrams() throws Exception {
-
+
Text key = new Text();
key.set("dummy-key");
- String[] input = {"the", "best", "of", "times", "the", "worst", "of", "times"};
+ String[] input = {"the", "best", "of", "times", "the", "worst", "of",
+ "times"};
StringTuple inputTuple = new StringTuple();
- for (String i: input) {
+ for (String i : input) {
inputTuple.add(i);
}
-
- String[][] values =
- new String[][]{
- {"h_the", "the best"},
- {"t_best", "the best"},
- {"h_best", "best of"},
- {"t_of", "best of"},
- {"h_of", "of times"},
- {"t_times", "of times"},
- {"h_times", "times the"},
- {"t_the", "times the"},
- {"h_the", "the worst"},
- {"t_worst", "the worst"},
- {"h_worst", "worst of"},
- {"t_of", "worst of"},
- {"h_of", "of times"},
- {"t_times", "of times"}
- };
+
+ String[][] values = new String[][] { {"h_the", "the best"},
+ {"t_best", "the best"},
+ {"h_of", "of times"},
+ {"t_times", "of times"},
+ {"h_best", "best of"},
+ {"t_of", "best of"},
+ {"h_the", "the worst"},
+ {"t_worst", "the worst"},
+ {"h_times", "times the"},
+ {"t_the", "times the"},
+ {"h_worst", "worst of"},
+ {"t_of", "worst of"},};
// set up expectations for mocks. ngram max size = 2
- for (String[] v: values) {
+ for (String[] v : values) {
Type p = v[0].startsWith("h") ? HEAD : TAIL;
- Gram subgram = new Gram(v[0].substring(2), p);
- Gram ngram = new Gram(v[1]);
+ int frequency = 1;
+ if (v[1].equals("of times")) frequency = 2;
+ Gram subgram = new Gram(v[0].substring(2), frequency, p);
+ Gram ngram = new Gram(v[1], frequency);
collector.collect(subgram, ngram);
}
-
-
+
reporter.incrCounter(CollocMapper.Count.NGRAM_TOTAL, 7);
EasyMock.replay(reporter, collector);
-
JobConf conf = new JobConf();
conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
-
+
CollocMapper c = new CollocMapper();
c.configure(conf);
c.map(key, inputTuple, collector, reporter);
-
+
EasyMock.verify(reporter, collector);
}
@Test
public void testCollectNgramsWithUnigrams() throws Exception {
-
+
Text key = new Text();
key.set("dummy-key");
- String[] input = {"the", "best", "of", "times", "the", "worst", "of", "times"};
+ String[] input = {"the", "best", "of", "times", "the", "worst", "of",
+ "times"};
StringTuple inputTuple = new StringTuple();
- for (String i: input) {
+ for (String i : input) {
inputTuple.add(i);
}
-
- String[][] values =
- new String[][]{
- {"u_the", "the"},
- {"h_the", "the best"},
- {"t_best", "the best"},
- {"u_best", "best"},
- {"h_best", "best of"},
- {"t_of", "best of"},
- {"u_of", "of"},
- {"h_of", "of times"},
- {"t_times", "of times"},
- {"u_times", "times"},
- {"h_times", "times the"},
- {"t_the", "times the"},
- {"u_the", "the"},
- {"h_the", "the worst"},
- {"t_worst", "the worst"},
- {"u_worst", "worst"},
- {"h_worst", "worst of"},
- {"t_of", "worst of"},
- {"u_of", "of"},
- {"h_of", "of times"},
- {"t_times", "of times"},
- {"u_times", "times"},
- };
+
+ String[][] values = new String[][] { {"h_the", "the best"},
+ {"t_best", "the best"},
+ {"h_of", "of times"},
+ {"t_times", "of times"},
+ {"h_best", "best of"},
+ {"t_of", "best of"},
+ {"h_the", "the worst"},
+ {"t_worst", "the worst"},
+ {"h_times", "times the"},
+ {"t_the", "times the"},
+ {"h_worst", "worst of"},
+ {"t_of", "worst of"},
+ {"u_worst", "worst"}, {"u_of", "of"},
+ {"u_the", "the"}, {"u_best", "best"},
+ {"u_times", "times"},};
// set up expectations for mocks. ngram max size = 2
- for (String[] v: values) {
+ for (String[] v : values) {
Type p = v[0].startsWith("h") ? HEAD : TAIL;
p = v[0].startsWith("u") ? UNIGRAM : p;
- Gram subgram = new Gram(v[0].substring(2), p);
- Gram ngram = new Gram(v[1]);
+ int frequency = 1;
+ if (v[1].equals("of times") || v[1].equals("of") || v[1].equals("times")
+ || v[1].equals("the")) frequency = 2;
+ Gram subgram = new Gram(v[0].substring(2), frequency, p);
+ Gram ngram = new Gram(v[1], frequency);
collector.collect(subgram, ngram);
}
-
-
+
reporter.incrCounter(CollocMapper.Count.NGRAM_TOTAL, 7);
EasyMock.replay(reporter, collector);
-
JobConf conf = new JobConf();
conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);
-
+
CollocMapper c = new CollocMapper();
c.configure(conf);
c.map(key, inputTuple, collector, reporter);
-
+
EasyMock.verify(reporter, collector);
}
-
+
/** A lucene 2.9 standard analyzer with no stopwords. */
public static class TestAnalyzer extends Analyzer {
final Analyzer a;