You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by dr...@apache.org on 2010/07/08 01:39:45 UTC

svn commit: r961536 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/utils/nlp/collocations/llr/ test/java/org/apache/mahout/utils/nlp/collocations/llr/

Author: drew
Date: Wed Jul  7 23:39:44 2010
New Revision: 961536

URL: http://svn.apache.org/viewvc?rev=961536&view=rev
Log:
MAHOUT-167, MAHOUT-417 -- minor tweaks to the collocations code that was updated to Hadoop 0.20

Modified:
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Wed Jul  7 23:39:44 2010
@@ -202,7 +202,7 @@ public final class CollocDriver extends 
                                           int reduceTasks,
                                           int minSupport) throws IOException {
 
-    Configuration con = new Configuration();
+    Configuration con = new Configuration(baseConf);
     con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
     con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
     con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
@@ -245,7 +245,7 @@ public final class CollocDriver extends 
                                              boolean emitUnigrams,
                                              float minLLRValue,
                                              int reduceTasks) throws IOException, InterruptedException, ClassNotFoundException {
-    Configuration conf = new Configuration();
+    Configuration conf = new Configuration(baseConf);
     conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
     conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
  

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/GramKeyGroupComparator.java Wed Jul  7 23:39:44 2010
@@ -29,7 +29,6 @@ public class GramKeyGroupComparator exte
     super(GramKey.class, true);
   }
 
-  @SuppressWarnings("unchecked")
   @Override
   public int compare(WritableComparable a, WritableComparable b) {
     GramKey gka = (GramKey) a;

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocMapperTest.java Wed Jul  7 23:39:44 2010
@@ -17,32 +17,40 @@
 
 package org.apache.mahout.utils.nlp.collocations.llr;
 
-import static org.junit.Assert.assertEquals;
-
 import java.io.Reader;
 import java.util.Collections;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
 import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Mapper.Context;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.util.Version;
-import org.apache.mahout.common.DummyOutputCollector;
-import org.apache.mahout.common.DummyRecordWriter;
 import org.apache.mahout.common.StringTuple;
 import org.apache.mahout.utils.nlp.collocations.llr.Gram.Type;
+import org.easymock.classextension.EasyMock;
+import org.junit.Before;
 import org.junit.Test;
 
 /**
  * Test for CollocMapper 
  */
+@SuppressWarnings("deprecation")
 public class CollocMapperTest {
   
-  private OutputCollector<GramKey,Gram> collector;
+  private Mapper<Text,StringTuple,GramKey,Gram>.Context context;
+  private Counter counter;
+  @Before
+  @SuppressWarnings("unchecked")
+  public void setUp() {
+    counter = EasyMock.createMock(Counter.class);
+    context = EasyMock.createMock(Context.class);
+  }
+  
   @Test
   public void testCollectNgrams() throws Exception {
     
@@ -69,6 +77,11 @@ public class CollocMapperTest {
                           {"h_worst", "worst of"},
                           {"t_of", "worst of"},};
     // set up expectations for mocks. ngram max size = 2
+    
+    Configuration conf = new Configuration();
+    conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
+    EasyMock.expect(context.getConfiguration()).andReturn(conf);
+    
     for (String[] v : values) {
       Type p = v[0].startsWith("h") ? Gram.Type.HEAD : Gram.Type.TAIL;
       int frequency = 1;
@@ -81,23 +94,20 @@ public class CollocMapperTest {
       
       GramKey subgramKey = new GramKey(subgram, new byte[0]);
       GramKey subgramNgramKey = new GramKey(subgram, ngram.getBytes());
-      collector = new DummyOutputCollector<GramKey, Gram>();
-      collector.collect(subgramKey, subgram);
-      collector.collect(subgramNgramKey, ngram);
+
+      context.write(subgramKey, subgram);
+      context.write(subgramNgramKey, ngram);
     }
+    EasyMock.expect(context.getCounter(CollocMapper.Count.NGRAM_TOTAL)).andReturn(counter);
+    counter.increment(7);
+    EasyMock.replay(context,counter);
+
+    CollocMapper c = new CollocMapper();
+    c.setup(context);
     
-    Configuration conf = new Configuration();
-    conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
-    
-    CollocMapper mapper = new CollocMapper();
-    DummyRecordWriter<GramKey, Gram> writer = new DummyRecordWriter<GramKey, Gram>();
-    Mapper<Text, StringTuple, GramKey, Gram>.Context context =  DummyRecordWriter.build(mapper, conf, writer);
-    mapper.setup(context);
-    
-    mapper.map(key, inputTuple, context);
+    c.map(key, inputTuple, context);
     
-    Counter counter = (Counter) context.getCounter(CollocMapper.Count.NGRAM_TOTAL);
-    assertEquals("counter", 7, counter.getValue());
+    EasyMock.verify(context);
   }
   
   @Test
@@ -130,6 +140,11 @@ public class CollocMapperTest {
                                          {"u_times", "times"},};
 
     // set up expectations for mocks. ngram max size = 2
+    Configuration conf = new Configuration();
+    conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
+    conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);
+    EasyMock.expect(context.getConfiguration()).andReturn(conf);
+    
     for (String[] v : values) {
       Type p = v[0].startsWith("h") ? Gram.Type.HEAD : Gram.Type.TAIL;
       p = v[0].startsWith("u") ? Gram.Type.UNIGRAM : p;
@@ -139,12 +154,12 @@ public class CollocMapperTest {
         frequency = 2;
       }
       
-      collector = new DummyOutputCollector<GramKey, Gram>();
+      
      
       if (p == Gram.Type.UNIGRAM) {
         Gram unigram = new Gram(v[1], frequency, Gram.Type.UNIGRAM);
         GramKey unigramKey = new GramKey(unigram, new byte[0]);
-        collector.collect(unigramKey, unigram);
+        context.write(unigramKey, unigram);
       }
       else {
         Gram subgram = new Gram(v[0].substring(2), frequency, p);
@@ -152,24 +167,21 @@ public class CollocMapperTest {
         
         GramKey subgramKey = new GramKey(subgram, new byte[0]);
         GramKey subgramNgramKey = new GramKey(subgram, ngram.getBytes());
-        collector.collect(subgramKey, subgram);
-        collector.collect(subgramNgramKey, ngram);
+        context.write(subgramKey, subgram);
+        context.write(subgramNgramKey, ngram);
       }
     }
     
-    Configuration conf = new Configuration();
-    conf.set(CollocMapper.MAX_SHINGLE_SIZE, "2");
-    conf.setBoolean(CollocDriver.EMIT_UNIGRAMS, true);
+    EasyMock.expect(context.getCounter(CollocMapper.Count.NGRAM_TOTAL)).andReturn(counter);
+    counter.increment(7);
+    EasyMock.replay(context,counter);
     
-    CollocMapper mapper = new CollocMapper();
-    DummyRecordWriter<GramKey, Gram> writer = new DummyRecordWriter<GramKey, Gram>();
-    Mapper<Text, StringTuple, GramKey, Gram>.Context context = DummyRecordWriter.build(mapper, conf, writer);
-    mapper.setup(context);
+    CollocMapper c = new CollocMapper();
+    c.setup(context);
     
-    mapper.map(key, inputTuple, context);
+    c.map(key, inputTuple, context);
     
-    Counter counter = (Counter) context.getCounter(CollocMapper.Count.NGRAM_TOTAL);
-    assertEquals("counter", 7, counter.getValue());
+    EasyMock.verify(context);
   }
   
   /** A lucene 2.9 standard analyzer with no stopwords. */

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/CollocReducerTest.java Wed Jul  7 23:39:44 2010
@@ -21,71 +21,69 @@ import static org.apache.mahout.utils.nl
 import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.NGRAM;
 import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.TAIL;
 import static org.apache.mahout.utils.nlp.collocations.llr.Gram.Type.UNIGRAM;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
 
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
-import java.util.Map;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.easymock.classextension.EasyMock;
+import org.junit.Before;
 import org.junit.Test;
 
 /**
  * Test the CollocReducer
  */
 public class CollocReducerTest {
-
+  
+  private Reducer<GramKey,Gram,Gram,Gram>.Context context;
+;  
+  @Before
+  @SuppressWarnings("unchecked")
+  public void setUp() {
+    context = EasyMock.createMock(Context.class);
+  }
+  
   @Test
   public void testReduce() throws Exception {
     // test input, input[*][0] is the key,
     // input[*][1..n] are the values passed in via
     // the iterator.
-    Gram[][] input = { { new Gram("the", UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM) },
-        { new Gram("the", HEAD), new Gram("the best", NGRAM), new Gram("the worst", NGRAM) },
-        { new Gram("of", HEAD), new Gram("of times", NGRAM), new Gram("of times", NGRAM) },
-        { new Gram("times", TAIL), new Gram("of times", NGRAM), new Gram("of times", NGRAM) } };
-
+    Gram[][] input = {
+        {new Gram("the", UNIGRAM), new Gram("the", UNIGRAM), new Gram("the", UNIGRAM)},
+        {new Gram("the", HEAD), new Gram("the best", NGRAM), new Gram("the worst", NGRAM)},
+        {new Gram("of", HEAD), new Gram("of times", NGRAM), new Gram("of times", NGRAM)},
+        {new Gram("times", TAIL), new Gram("of times", NGRAM), new Gram("of times", NGRAM)}};
+    
     // expected results.
-    Gram[][] values = { { new Gram("the", 2, UNIGRAM), new Gram("the", 2, UNIGRAM) },
-        { new Gram("the best", 1, NGRAM), new Gram("the", 2, HEAD) },
-        { new Gram("the worst", 1, NGRAM), new Gram("the", 2, HEAD) }, 
-        { new Gram("of times", 2, NGRAM), new Gram("of", 2, HEAD) },
-        { new Gram("of times", 2, NGRAM), new Gram("times", 2, TAIL) } };
-
-    Map<Gram, List<Gram>> reference = new HashMap<Gram, List<Gram>>();
-    for (Gram[] grams : values) {
-      List<Gram> list = reference.get(grams[0]);
-      if (list == null) {
-        list = new ArrayList<Gram>();
-        reference.put(grams[0], list);
-      }
-      for (int j = 1; j < grams.length; j++)
-        list.add(grams[j]);
+    Gram[][] values = {{new Gram("the", 2, UNIGRAM), new Gram("the", 2, UNIGRAM)},
+                                    {new Gram("the best", 1, NGRAM), new Gram("the", 2, HEAD)},
+                                    {new Gram("the worst", 1, NGRAM), new Gram("the", 2, HEAD)},
+                                    {new Gram("of times", 2, NGRAM), new Gram("of", 2, HEAD)},
+                                    {new Gram("of times", 2, NGRAM), new Gram("times", 2, TAIL)}};
+
+    // set up expectations
+    for (Gram[] v : values) {
+      context.write(v[0], v[1]);
     }
-
-    // reduce the input data.
-    Configuration conf = new Configuration();
-    CollocReducer reducer = new CollocReducer();
-    DummyRecordWriter<Gram, Gram> writer = new DummyRecordWriter<Gram, Gram>();
-    Reducer<GramKey, Gram, Gram, Gram>.Context context = DummyRecordWriter.build(reducer, conf, writer, GramKey.class, Gram.class);
-
+    EasyMock.replay(context);
+    
+    // play back the input data.
+    CollocReducer c = new CollocReducer();
+    
     GramKey key = new GramKey();
 
     byte[] empty = new byte[0];
     for (Gram[] ii : input) {
       key.set(ii[0], empty);
+
       List<Gram> vv = new LinkedList<Gram>();
       vv.addAll(Arrays.asList(ii));
-      reducer.reduce(key, vv, context);
+      c.reduce(key, vv, context);
     }
-    assertTrue(writer.getKeys().size() == reference.keySet().size());
-    for (Gram gram : reference.keySet())
-      assertEquals("Gram " + gram, reference.get(gram).size(), writer.getValue(gram).size());
+    
+    EasyMock.verify(context);
   }
+  
 }

Modified: mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java?rev=961536&r1=961535&r2=961536&view=diff
==============================================================================
--- mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java (original)
+++ mahout/trunk/utils/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/LLRReducerTest.java Wed Jul  7 23:39:44 2010
@@ -28,11 +28,12 @@ import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.common.DummyRecordWriter;
+import org.apache.hadoop.mapreduce.Reducer.Context;
 import org.apache.mahout.math.stats.LogLikelihood;
 import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.LLCallback;
-import org.easymock.EasyMock;
+import org.easymock.classextension.EasyMock;
 import org.junit.Before;
 import org.junit.Test;
 import org.slf4j.Logger;
@@ -46,11 +47,14 @@ public class LLRReducerTest {
   private static final Logger log =
     LoggerFactory.getLogger(LLRReducerTest.class);
   
+  private Reducer<Gram, Gram, Text, DoubleWritable>.Context context;
   private LLCallback ll;
   private LLCallback cl;
-
+  
   @Before
+  @SuppressWarnings("unchecked")
   public void setUp() {
+    context   = EasyMock.createMock(Reducer.Context.class);
     ll        = EasyMock.createMock(LLCallback.class);
     cl        = new LLCallback() {
       @Override
@@ -63,6 +67,7 @@ public class LLRReducerTest {
   
   @Test
   public void testReduce() throws Exception {
+    LLRReducer reducer = new LLRReducer(ll);
     
     // test input, input[*][0] is the key,
     // input[*][1..n] are the values passed in via
@@ -88,21 +93,19 @@ public class LLRReducerTest {
                             {1, 0, 1, 5}  // worst of
     };
     
-    for (int[] ee: expectations) {
+    Configuration config = new Configuration();
+    config.set(LLRReducer.NGRAM_TOTAL, "7");
+    EasyMock.expect(context.getConfiguration()).andReturn(config);
+    
+    for (int i=0; i < expectations.length; i++) {
+      int[] ee = expectations[i];
+      context.write(EasyMock.eq(new Text(input[i][0].getString())), (DoubleWritable) EasyMock.anyObject());
       EasyMock.expect(ll.logLikelihoodRatio(ee[0], ee[1], ee[2], ee[3])).andDelegateTo(cl);
+      
     }
+
+    EasyMock.replay(context, ll);
     
-    EasyMock.replay(ll);
-    
-    Configuration conf = new Configuration();
-    conf.set(LLRReducer.NGRAM_TOTAL, "7");
-    LLRReducer reducer = new LLRReducer(ll);
-    DummyRecordWriter<Text, DoubleWritable> writer = new DummyRecordWriter<Text, DoubleWritable>();
-    Reducer<Gram, Gram, Text, DoubleWritable>.Context context = DummyRecordWriter.build(reducer,
-                                                                                          conf,
-                                                                                          writer,
-                                                                                          Gram.class,
-                                                                                          Gram.class);
     reducer.setup(context);
     
     for (Gram[] ii: input) {