You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by dr...@apache.org on 2010/07/08 01:39:45 UTC
svn commit: r961536 - in /mahout/trunk/utils/src:
main/java/org/apache/mahout/utils/nlp/collocations/llr/
test/java/org/apache/mahout/utils/nlp/collocations/llr/
Author: drew
Date: Wed Jul 7 23:39:44 2010
New Revision: 961536
URL: http://svn.apache.org/viewvc?rev=961536&view=rev
Log:
MAHOUT-167, MAHOUT-417 -- minor tweaks to the collocations code that was updated to 0.20
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Wed Jul 7 23:39:44 2010
@@ -202,7 +202,7 @@ public final class CollocDriver extends
int reduceTasks,
int minSupport) throws IOException {
- Configuration con = new Configuration();
+ Configuration con = new Configuration(baseConf);
con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
@@ -245,7 +245,7 @@ public final class CollocDriver extends
boolean emitUnigrams,
float minLLRValue,
int reduceTasks) throws IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
+ Configuration conf = new Configuration(baseConf);
conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java Wed Jul 7 23:39:44 2010
@@ -29,7 +29,6 @@ public class GramKeyGroupComparator exte
super(GramKey.class, true);
}
- @SuppressWarnings("unchecked")
@Override
public int compare(WritableComparable a, WritableComparable b) {
GramKey gka = (GramKey) a;
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java Wed Jul 7 23:39:44 2010
@@ -17,32 +17,40 @@
package org.apache.mahout.utils.nlp.collocations.llr;
-import static org.junit.Assert.assertEquals;
-
import java.io.Reader;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
-import org.apache.mahout.common.DummyOutputCollector;
-import org.apache.mahout.common.DummyRecordWriter;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.utils.nlp.collocations.llr.Gram.Type;
+import org.easymock.classextension.EasyMock;
+import org.junit.Before;
import org.junit.Test;
/**
* Test for CollocMapper
*/
+@SuppressWarnings("deprecation")
public class CollocMapperTest {
- private OutputCollector<GramKey,Gram> collector;
+ private Mapper<Text,StringTuple,GramKey,Gram>.Context context;
+ private Counter counter;
+ @Before
+ @SuppressWarnings("unchecked")
+ public void setUp() {
+ counter = EasyMock.createMock(Counter.class);
+ context = EasyMock.createMock(Context.class);
+ }
+
@Test
public void testCollectNgrams() throws Exception {
@@ -69,6 +77,11 @@ public class CollocMapperTest {
{"h_worst", "worst of"},
{"t_of", "worst of"},};
// set up expectations for mocks. ngram max size = 2
+
+ Configuration conf = new Configuration();
+ conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
+ EasyMock.expect(context.getConfiguration()).andReturn(conf);
+
for (String[] v : values) {
Type p = v[0].startsWith("h") ? Gram.Type.HEAD : Gram.Type.TAIL;
int frequency = 1;
@@ -81,23 +94,20 @@ public class CollocMapperTest {
GramKey subgramKey = new GramKey(subgram, new byte[0]);
GramKey subgramNgramKey = new GramKey(subgram, ngram.getBytes());
- collector = new DummyOutputCollector<GramKey, Gram>();
- collector.collect(subgramKey, subgram);
- collector.collect(subgramNgramKey, ngram);
+
+ context.write(subgramKey, subgram);
+ context.write(subgramNgramKey, ngram);
}
+ EasyMock.expect(context.getCounter(CollocMapper.Count.NGRAM_TOTAL)).andReturn(counter);
+ counter.increment(7);
+ EasyMock.replay(context,counter);
+
+ CollocMapper c = new CollocMapper();
+ c.setup(context);
- Configuration conf = new Configuration();
- conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
-
- CollocMapper mapper = new CollocMapper();
- DummyRecordWriter<GramKey, Gram> writer = new DummyRecordWriter<GramKey, Gram>();
- Mapper<Text, StringTuple, GramKey, Gram>.Context context = DummyRecordWriter.build(mapper, conf, writer);
- mapper.setup(context);
-
- mapper.map(key, inputTuple, context);
+ c.map(key, inputTuple, context);
- Counter counter = (Counter) context.getCounter(CollocMapper.Count.NGRAM_TOTAL);
- assertEquals("counter", 7, counter.getValue());
+ EasyMock.verify(context);
}
@Test
@@ -130,6 +140,11 @@ public class CollocMapperTest {
{"u_times", "times"},};
// set up expectations for mocks. ngram max size = 2
+ Configuration conf = new Configuration();
+ conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
+ conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);
+ EasyMock.expect(context.getConfiguration()).andReturn(conf);
+
for (String[] v : values) {
Type p = v[0].startsWith("h") ? Gram.Type.HEAD : Gram.Type.TAIL;
p = v[0].startsWith("u") ? Gram.Type.UNIGRAM : p;
@@ -139,12 +154,12 @@ public class CollocMapperTest {
frequency = 2;
}
- collector = new DummyOutputCollector<GramKey, Gram>();
+
if (p == Gram.Type.UNIGRAM) {
Gram unigram = new Gram(v[1], frequency, Gram.Type.UNIGRAM);
GramKey unigramKey = new GramKey(unigram, new byte[0]);
- collector.collect(unigramKey, unigram);
+ context.write(unigramKey, unigram);
}
else {
Gram subgram = new Gram(v[0].substring(2), frequency, p);
@@ -152,24 +167,21 @@ public class CollocMapperTest {
GramKey subgramKey = new GramKey(subgram, new byte[0]);
GramKey subgramNgramKey = new GramKey(subgram, ngram.getBytes());
- collector.collect(subgramKey, subgram);
- collector.collect(subgramNgramKey, ngram);
+ context.write(subgramKey, subgram);
+ context.write(subgramNgramKey, ngram);
}
}
- Configuration conf = new Configuration();
- conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
- conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);
+ EasyMock.expect(context.getCounter(CollocMapper.Count.NGRAM_TOTAL)).andReturn(counter);
+ counter.increment(7);
+ EasyMock.replay(context,counter);
- CollocMapper mapper = new CollocMapper();
- DummyRecordWriter<GramKey, Gram> writer = new DummyRecordWriter<GramKey, Gram>();
- Mapper<Text, StringTuple, GramKey, Gram>.Context context = DummyRecordWriter.build(mapper, conf, writer);
- mapper.setup(context);
+ CollocMapper c = new CollocMapper();
+ c.setup(context);
- mapper.map(key, inputTuple, context);
+ c.map(key, inputTuple, context);
- Counter counter = (Counter) context.getCounter(CollocMapper.Count.NGRAM_TOTAL);
- assertEquals("counter", 7, counter.getValue());
+ EasyMock.verify(context);
}
/** A lucene 2.9 standard analyzer with no stopwords. */
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java Wed Jul 7 23:39:44 2010
@@ -21,71 +21,69 @@ import static org.apache.mahout.utils.nl
import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.NGRAM;
import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.TAIL;
import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.UNIGRAM;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
-import java.util.Map;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.easymock.classextension.EasyMock;
+import org.junit.Before;
import org.junit.Test;
/**
* Test the CollocReducer
*/
public class CollocReducerTest {
-
+
+ private Reducer<GramKey,Gram,Gram,Gram>.Context context;
+;
+ @Before
+ @SuppressWarnings("unchecked")
+ public void setUp() {
+ context = EasyMock.createMock(Context.class);
+ }
+
@Test
public void testReduce() throws Exception {
// test input, input[*][0] is the key,
// input[*][1..n] are the values passed in via
// the iterator.
- Gram[][] input = { { new Gram("the", UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM) },
- { new Gram("the", HEAD), new Gram("the best", NGRAM), new Gram("the worst", NGRAM) },
- { new Gram("of", HEAD), new Gram("of times", NGRAM), new Gram("of times", NGRAM) },
- { new Gram("times", TAIL), new Gram("of times", NGRAM), new Gram("of times", NGRAM) } };
-
+ Gram[][] input = {
+ {new Gram("the", UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM)},
+ {new Gram("the", HEAD), new Gram("the best", NGRAM), new Gram("the worst", NGRAM)},
+ {new Gram("of", HEAD), new Gram("of times", NGRAM), new Gram("of times", NGRAM)},
+ {new Gram("times", TAIL), new Gram("of times", NGRAM), new Gram("of times", NGRAM)}};
+
// expected results.
- Gram[][] values = { { new Gram("the", 2, UNIGRAM), new Gram("the", 2, UNIGRAM) },
- { new Gram("the best", 1, NGRAM), new Gram("the", 2, HEAD) },
- { new Gram("the worst", 1, NGRAM), new Gram("the", 2, HEAD) },
- { new Gram("of times", 2, NGRAM), new Gram("of", 2, HEAD) },
- { new Gram("of times", 2, NGRAM), new Gram("times", 2, TAIL) } };
-
- Map<Gram, List<Gram>> reference = new HashMap<Gram, List<Gram>>();
- for (Gram[] grams : values) {
- List<Gram> list = reference.get(grams[0]);
- if (list == null) {
- list = new ArrayList<Gram>();
- reference.put(grams[0], list);
- }
- for (int j = 1; j < grams.length; j++)
- list.add(grams[j]);
+ Gram[][] values = {{new Gram("the", 2, UNIGRAM), new Gram("the", 2, UNIGRAM)},
+ {new Gram("the best", 1, NGRAM), new Gram("the", 2, HEAD)},
+ {new Gram("the worst", 1, NGRAM), new Gram("the", 2, HEAD)},
+ {new Gram("of times", 2, NGRAM), new Gram("of", 2, HEAD)},
+ {new Gram("of times", 2, NGRAM), new Gram("times", 2, TAIL)}};
+
+ // set up expectations
+ for (Gram[] v : values) {
+ context.write(v[0], v[1]);
}
-
- // reduce the input data.
- Configuration conf = new Configuration();
- CollocReducer reducer = new CollocReducer();
- DummyRecordWriter<Gram, Gram> writer = new DummyRecordWriter<Gram, Gram>();
- Reducer<GramKey, Gram, Gram, Gram>.Context context = DummyRecordWriter.build(reducer, conf, writer, GramKey.class, Gram.class);
-
+ EasyMock.replay(context);
+
+ // play back the input data.
+ CollocReducer c = new CollocReducer();
+
GramKey key = new GramKey();
byte[] empty = new byte[0];
for (Gram[] ii : input) {
key.set(ii[0], empty);
+
List<Gram> vv = new LinkedList<Gram>();
vv.addAll(Arrays.asList(ii));
- reducer.reduce(key, vv, context);
+ c.reduce(key, vv, context);
}
- assertTrue(writer.getKeys().size() == reference.keySet().size());
- for (Gram gram : reference.keySet())
- assertEquals("Gram " + gram, reference.get(gram).size(), writer.getValue(gram).size());
+
+ EasyMock.verify(context);
}
+
}
Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java Wed Jul 7 23:39:44 2010
@@ -28,11 +28,12 @@ import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.mahout.math.stats.LogLikelihood;
import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.LLCallback;
-import org.easymock.EasyMock;
+import org.easymock.classextension.EasyMock;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
@@ -46,11 +47,14 @@ public class LLRReducerTest {
private static final Logger log =
LoggerFactory.getLogger(LLRReducerTest.class);
+ private Reducer<Gram, Gram, Text, DoubleWritable>.Context context;
private LLCallback ll;
private LLCallback cl;
-
+
@Before
+ @SuppressWarnings("unchecked")
public void setUp() {
+ context = EasyMock.createMock(Reducer.Context.class);
ll = EasyMock.createMock(LLCallback.class);
cl = new LLCallback() {
@Override
@@ -63,6 +67,7 @@ public class LLRReducerTest {
@Test
public void testReduce() throws Exception {
+ LLRReducer reducer = new LLRReducer(ll);
// test input, input[*][0] is the key,
// input[*][1..n] are the values passed in via
@@ -88,21 +93,19 @@ public class LLRReducerTest {
{1, 0, 1, 5} // worst of
};
- for (int[] ee: expectations) {
+ Configuration config = new Configuration();
+ config.set(LLRReducer.NGRAM_TOTAL, "7");
+ EasyMock.expect(context.getConfiguration()).andReturn(config);
+
+ for (int i=0; i < expectations.length; i++) {
+ int[] ee = expectations[i];
+ context.write(EasyMock.eq(new Text(input[i][0].getString())), (DoubleWritable) EasyMock.anyObject());
EasyMock.expect(ll.logLikelihoodRatio(ee[0], ee[1], ee[2], ee[3])).andDelegateTo(cl);
+
}
+
+ EasyMock.replay(context, ll);
- EasyMock.replay(ll);
-
- Configuration conf = new Configuration();
- conf.set(LLRReducer.NGRAM_TOTAL, "7");
- LLRReducer reducer = new LLRReducer(ll);
- DummyRecordWriter<Text, DoubleWritable> writer = new DummyRecordWriter<Text, DoubleWritable>();
- Reducer<Gram, Gram, Text, DoubleWritable>.Context context = DummyRecordWriter.build(reducer,
- conf,
- writer,
- Gram.class,
- Gram.class);
reducer.setup(context);
for (Gram[] ii: input) {