You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2011/01/17 02:02:52 UTC
svn commit: r1059716 - in /mahout/trunk/math/src: main/java/org/apache/mahout/math/stats/LogLikelihood.java test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java

Author: tdunning
Date: Mon Jan 17 01:02:52 2011
New Revision: 1059716

URL: http://svn.apache.org/viewvc?rev=1059716&view=rev
Log:
MAHOUT-585 - Added LLR comparison code as in Luduan system.  Added
test case for same. Also some small javadoc cleanups.

Modified:
    mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
    mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java

Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1059716&r1=1059715&r2=1059716&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Mon Jan 17 01:02:52 2011
@@ -17,6 +17,14 @@
 
 package org.apache.mahout.math.stats;
 
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.PriorityQueue;
+
 /**
  * Utility methods for working with log-likelihood
  */
@@ -26,7 +34,7 @@ public final class LogLikelihood {
   }
 
   /**
-   * Calculate the unnormalized Shannon entropy.  This is
+   * Calculates the unnormalized Shannon entropy.  This is
    *
    * -sum x_i log x_i / N = -N sum x_i/N log x_i/N
    *
@@ -55,7 +63,7 @@ public final class LogLikelihood {
   }
 
   /**
-   * Calculate the Raw Log-likelihood ratio for two events, call them A and B.  Then we have:
+   * Calculates the Raw Log-likelihood ratio for two events, call them A and B.  Then we have:
    * <p/>
    * <table border="1" cellpadding="5" cellspacing="0">
    * <tbody><tr><td>&nbsp;</td><td>Event A</td><td>Everything but A</td></tr>
@@ -85,7 +93,7 @@ public final class LogLikelihood {
   }
   
   /** 
-   * Calculate the root log-likelihood ratio for two events.
+   * Calculates the root log-likelihood ratio for two events.
    * See {@link #logLikelihoodRatio(int, int, int, int)}.
 
    * @param k11 The number of times the two events occurred together
@@ -106,4 +114,68 @@ public final class LogLikelihood {
     }
     return sqrt;
   }
+
+  /**
+   * Compares two sets of counts to see which items are interestingly over-represented in the first
+   * set.
+   * @param a  The first counts.
+   * @param b  The reference counts.
+   * @param maxReturn  The maximum number of items to return.  Use maxReturn >= a.elementSet.size() to return all
+   * scores above the threshold.
+   * @param threshold  The minimum score for items to be returned.  Use 0 to return all items more common
+   * in a than b.  Use -Double.MAX_VALUE (not Double.MIN_VALUE !) to not use a threshold.
+   * @return  A list of scored items with their scores.
+   */
+  public static <T> List<ScoredItem<T>> compareFrequencies(Multiset<T> a, Multiset<T> b, int maxReturn, double threshold) {
+    int totalA = a.size();
+    int totalB = b.size();
+
+    Ordering<ScoredItem<T>> byScoreAscending = new Ordering<ScoredItem<T>>() {
+      public int compare(ScoredItem<T> tScoredItem, ScoredItem<T> tScoredItem1) {
+        return Double.compare(tScoredItem.score, tScoredItem1.score);
+      }
+    };
+    PriorityQueue<ScoredItem<T>> best = new PriorityQueue<ScoredItem<T>>(maxReturn + 1, byScoreAscending);
+
+    for (T t : a.elementSet()) {
+      compareAndAdd(a, b, maxReturn, threshold, totalA, totalB, best, t);
+    }
+
+    // if threshold >= 0 we only iterate through a because anything not there can't be as or more common than in b.
+    if (threshold < 0) {
+      for (T t : b.elementSet()) {
+        // only items missing from a need be scored
+        if (a.count(t) == 0) {
+          compareAndAdd(a, b, maxReturn, threshold, totalA, totalB, best, t);
+        }
+      }
+    }
+
+    List<ScoredItem<T>> r = Lists.newArrayList(best);
+    Collections.sort(r, byScoreAscending.reverse());
+    return r;
+  }
+
+  private static <T> void compareAndAdd(Multiset<T> a, Multiset<T> b, int maxReturn, double threshold, int totalA, int totalB, PriorityQueue<ScoredItem<T>> best, T t) {
+    int kA = a.count(t);
+    int kB = b.count(t);
+    double score = rootLogLikelihoodRatio(kA, totalA - kA, kB, totalB - kB);
+    if (score >= threshold) {
+      ScoredItem<T> x = new ScoredItem<T>(t, score);
+      best.add(x);
+      while (best.size() > maxReturn) {
+        best.poll();
+      }
+    }
+  }
+
+  public final static class ScoredItem<T> {
+    public T item;
+    public double score;
+
+    public ScoredItem(T item, double score) {
+      this.item = item;
+      this.score = score;
+    }
+  }
 }

Modified: mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=1059716&r1=1059715&r2=1059716&view=diff
==============================================================================
--- mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (original)
+++ mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Mon Jan 17 01:02:52 2011
@@ -17,10 +17,19 @@
 
 package org.apache.mahout.math.stats;
 
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
 import org.apache.mahout.math.MahoutTestCase;
-
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
 import org.junit.Test;
 
+import java.util.List;
+import java.util.Random;
+
 public final class LogLikelihoodTest extends MahoutTestCase {
 
   @Test
@@ -33,7 +42,7 @@ public final class LogLikelihoodTest ext
       LogLikelihood.entropy(-1, -1);//exception
       fail();
     } catch (IllegalArgumentException e) {
-      
+
     }
   }
 
@@ -52,7 +61,7 @@ public final class LogLikelihoodTest ext
   public void testRootLogLikelihood() throws Exception {
     // positive where k11 is bigger than expected.
     assertTrue(LogLikelihood.rootLogLikelihoodRatio(904, 21060, 1144, 283012) > 0.0);
-    
+
     // negative because k11 is lower than expected
     assertTrue(LogLikelihood.rootLogLikelihoodRatio(36, 21928, 60280, 623876) < 0.0);
   }
@@ -61,4 +70,116 @@ public final class LogLikelihoodTest ext
   public void testRootNegativeLLR() {
     assertEquals(0.0, LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487), 0.00000001);
   }
+
+  @Test
+  public void testFrequencyComparison() {
+    final Random rand = RandomUtils.getRandom();
+
+    // build a vector full of sample from exponential distribuiton
+    // this will have lots of little positive values and a few big ones
+    Vector p1 = new DenseVector(25)
+      .assign(new DoubleFunction() {
+        public double apply(double arg1) {
+          return -Math.log(1 - rand.nextDouble());
+        }
+      });
+
+    // make a copy
+    Vector p2 = p1.like().assign(p1);
+
+    // nuke elements 0..4
+    p1.viewPart(0, 5).assign(0);
+
+    // and boost elements 5..7
+    p1.viewPart(5, 3).assign(Functions.mult(4));
+
+    // then normalize to turn it into a probability distribution
+    p1.assign(Functions.div(p1.norm(1)));
+
+    // likewise normalize p2
+    p2.assign(Functions.div(p2.norm(1)));
+
+    // sample 100 times from p1
+    Multiset<Integer> w1 = HashMultiset.create();
+    for (int i = 0; i < 100; i++) {
+      w1.add(sample(p1, rand));
+    }
+
+    // and 1000 times from p2
+    Multiset<Integer> w2 = HashMultiset.create();
+    for (int i = 0; i < 1000; i++) {
+      w2.add(sample(p2, rand));
+    }
+
+    // comparing frequencies, we should be able to find 8 items with score > 0
+    List<LogLikelihood.ScoredItem<Integer>> r = LogLikelihood.compareFrequencies(w1, w2, 8, 0);
+    assertTrue(r.size() <= 8);
+    assertTrue(r.size() > 0);
+    for (LogLikelihood.ScoredItem<Integer> item : r) {
+      assertTrue(item.score >= 0);
+    }
+
+    // the most impressive should be 7
+    assertEquals(7, (int) r.get(0).item);
+
+    // make sure scores are descending
+    double lastScore = r.get(0).score;
+    for (LogLikelihood.ScoredItem<Integer> item : r) {
+      assertTrue(item.score <= lastScore);
+      lastScore = item.score;
+    }
+
+    // now as many as have score >= 1
+    r = LogLikelihood.compareFrequencies(w1, w2, 40, 1);
+
+    // only the boosted items should make the cut
+    assertEquals(3, r.size());
+    assertEquals(7, (int) r.get(0).item);
+    assertEquals(5, (int) r.get(1).item);
+    assertEquals(6, (int) r.get(2).item);
+
+    r = LogLikelihood.compareFrequencies(w1, w2, 1000, -100);
+    Multiset<Integer> k = HashMultiset.create();
+    for (LogLikelihood.ScoredItem<Integer> item : r) {
+      k.add(item.item);
+    }
+    for (int i = 0; i < 25; i++) {
+      assertTrue("i = " + i, k.count(i) == 1 || w2.count(i) == 0);
+    }
+
+    // all values that had non-zero counts in larger set should have result scores
+    assertEquals(w2.elementSet().size(), r.size());
+    assertEquals(7, (int) r.get(0).item);
+    assertEquals(5, (int) r.get(1).item);
+    assertEquals(6, (int) r.get(2).item);
+    
+    // the last item should definitely have negative score
+    assertTrue(r.get(r.size() - 1).score < 0);
+
+    // make sure scores are descending
+    lastScore = r.get(0).score;
+    for (LogLikelihood.ScoredItem<Integer> item : r) {
+      assertTrue(item.score <= lastScore);
+      lastScore = item.score;
+    }
+  }
+
+  /**
+   * Samples from a multinomial distribution with parameters p and random generator rand.
+   * @param p      A vector describing the distribution.  Should sum to 1.
+   * @param rand   A random number generator.
+   * @return  A single sample from the multinomial distribution.
+   */
+  private int sample(Vector p, Random rand) {
+    double u = rand.nextDouble();
+
+    // simple sequential algorithm.  Not the fastest, but we don't care
+    for (int i = 0; i < p.size(); i++) {
+      if (u <= p.get(i)) {
+        return i;
+      }
+      u -= p.get(i);
+    }
+    return p.size() - 1;
+  }
 }