You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2011/01/17 02:02:52 UTC
svn commit: r1059716 - in /mahout/trunk/math/src:
main/java/org/apache/mahout/math/stats/LogLikelihood.java
test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
Author: tdunning
Date: Mon Jan 17 01:02:52 2011
New Revision: 1059716
URL: http://svn.apache.org/viewvc?rev=1059716&view=rev
Log:
MAHOUT-585 - Added LLR comparison code as in Luduan system. Added
test case for same. Also some small javadoc cleanups.
Modified:
mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=1059716&r1=1059715&r2=1059716&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Mon Jan 17 01:02:52 2011
@@ -17,6 +17,14 @@
package org.apache.mahout.math.stats;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.PriorityQueue;
+
/**
* Utility methods for working with log-likelihood
*/
@@ -26,7 +34,7 @@ public final class LogLikelihood {
}
/**
- * Calculate the unnormalized Shannon entropy. This is
+ * Calculates the unnormalized Shannon entropy. This is
*
* -sum x_i log x_i / N = -N sum x_i/N log x_i/N
*
@@ -55,7 +63,7 @@ public final class LogLikelihood {
}
/**
- * Calculate the Raw Log-likelihood ratio for two events, call them A and B. Then we have:
+ * Calculates the Raw Log-likelihood ratio for two events, call them A and B. Then we have:
* <p/>
* <table border="1" cellpadding="5" cellspacing="0">
* <tbody><tr><td> </td><td>Event A</td><td>Everything but A</td></tr>
@@ -85,7 +93,7 @@ public final class LogLikelihood {
}
/**
- * Calculate the root log-likelihood ratio for two events.
+ * Calculates the root log-likelihood ratio for two events.
* See {@link #logLikelihoodRatio(int, int, int, int)}.
* @param k11 The number of times the two events occurred together
@@ -106,4 +114,68 @@ public final class LogLikelihood {
}
return sqrt;
}
+
+ /**
+ * Compares two sets of counts to see which items are interestingly over-represented in the first
+ * set.
+ * @param a The first counts.
+ * @param b The reference counts.
+ * @param maxReturn The maximum number of items to return. Use maxReturn >= a.elementSet.size() to return all
+ * scores above the threshold.
+ * @param threshold The minimum score for items to be returned. Use 0 to return all items more common
+ * in a than b. Use -Double.MAX_VALUE (not Double.MIN_VALUE !) to not use a threshold.
+ * @return A list of scored items with their scores.
+ */
+ public static <T> List<ScoredItem<T>> compareFrequencies(Multiset<T> a, Multiset<T> b, int maxReturn, double threshold) {
+ int totalA = a.size();
+ int totalB = b.size();
+
+ Ordering<ScoredItem<T>> byScoreAscending = new Ordering<ScoredItem<T>>() {
+ public int compare(ScoredItem<T> tScoredItem, ScoredItem<T> tScoredItem1) {
+ return Double.compare(tScoredItem.score, tScoredItem1.score);
+ }
+ };
+ PriorityQueue<ScoredItem<T>> best = new PriorityQueue<ScoredItem<T>>(maxReturn + 1, byScoreAscending);
+
+ for (T t : a.elementSet()) {
+ compareAndAdd(a, b, maxReturn, threshold, totalA, totalB, best, t);
+ }
+
+ // if threshold >= 0 we only iterate through a because anything not there can't be as or more common than in b.
+ if (threshold < 0) {
+ for (T t : b.elementSet()) {
+ // only items missing from a need be scored
+ if (a.count(t) == 0) {
+ compareAndAdd(a, b, maxReturn, threshold, totalA, totalB, best, t);
+ }
+ }
+ }
+
+ List<ScoredItem<T>> r = Lists.newArrayList(best);
+ Collections.sort(r, byScoreAscending.reverse());
+ return r;
+ }
+
+ private static <T> void compareAndAdd(Multiset<T> a, Multiset<T> b, int maxReturn, double threshold, int totalA, int totalB, PriorityQueue<ScoredItem<T>> best, T t) {
+ int kA = a.count(t);
+ int kB = b.count(t);
+ double score = rootLogLikelihoodRatio(kA, totalA - kA, kB, totalB - kB);
+ if (score >= threshold) {
+ ScoredItem<T> x = new ScoredItem<T>(t, score);
+ best.add(x);
+ while (best.size() > maxReturn) {
+ best.poll();
+ }
+ }
+ }
+
+ public final static class ScoredItem<T> {
+ public T item;
+ public double score;
+
+ public ScoredItem(T item, double score) {
+ this.item = item;
+ this.score = score;
+ }
+ }
}
Modified: mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=1059716&r1=1059715&r2=1059716&view=diff
==============================================================================
--- mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (original)
+++ mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Mon Jan 17 01:02:52 2011
@@ -17,10 +17,19 @@
package org.apache.mahout.math.stats;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.MahoutTestCase;
-
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
import org.junit.Test;
+import java.util.List;
+import java.util.Random;
+
public final class LogLikelihoodTest extends MahoutTestCase {
@Test
@@ -33,7 +42,7 @@ public final class LogLikelihoodTest ext
LogLikelihood.entropy(-1, -1);//exception
fail();
} catch (IllegalArgumentException e) {
-
+
}
}
@@ -52,7 +61,7 @@ public final class LogLikelihoodTest ext
public void testRootLogLikelihood() throws Exception {
// positive where k11 is bigger than expected.
assertTrue(LogLikelihood.rootLogLikelihoodRatio(904, 21060, 1144, 283012) > 0.0);
-
+
// negative because k11 is lower than expected
assertTrue(LogLikelihood.rootLogLikelihoodRatio(36, 21928, 60280, 623876) < 0.0);
}
@@ -61,4 +70,116 @@ public final class LogLikelihoodTest ext
public void testRootNegativeLLR() {
assertEquals(0.0, LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487), 0.00000001);
}
+
+ @Test
+ public void testFrequencyComparison() {
+ final Random rand = RandomUtils.getRandom();
+
+ // build a vector full of sample from exponential distribuiton
+ // this will have lots of little positive values and a few big ones
+ Vector p1 = new DenseVector(25)
+ .assign(new DoubleFunction() {
+ public double apply(double arg1) {
+ return -Math.log(1 - rand.nextDouble());
+ }
+ });
+
+ // make a copy
+ Vector p2 = p1.like().assign(p1);
+
+ // nuke elements 0..4
+ p1.viewPart(0, 5).assign(0);
+
+ // and boost elements 5..7
+ p1.viewPart(5, 3).assign(Functions.mult(4));
+
+ // then normalize to turn it into a probability distribution
+ p1.assign(Functions.div(p1.norm(1)));
+
+ // likewise normalize p2
+ p2.assign(Functions.div(p2.norm(1)));
+
+ // sample 100 times from p1
+ Multiset<Integer> w1 = HashMultiset.create();
+ for (int i = 0; i < 100; i++) {
+ w1.add(sample(p1, rand));
+ }
+
+ // and 1000 times from p2
+ Multiset<Integer> w2 = HashMultiset.create();
+ for (int i = 0; i < 1000; i++) {
+ w2.add(sample(p2, rand));
+ }
+
+ // comparing frequencies, we should be able to find 8 items with score > 0
+ List<LogLikelihood.ScoredItem<Integer>> r = LogLikelihood.compareFrequencies(w1, w2, 8, 0);
+ assertTrue(r.size() <= 8);
+ assertTrue(r.size() > 0);
+ for (LogLikelihood.ScoredItem<Integer> item : r) {
+ assertTrue(item.score >= 0);
+ }
+
+ // the most impressive should be 7
+ assertEquals(7, (int) r.get(0).item);
+
+ // make sure scores are descending
+ double lastScore = r.get(0).score;
+ for (LogLikelihood.ScoredItem<Integer> item : r) {
+ assertTrue(item.score <= lastScore);
+ lastScore = item.score;
+ }
+
+ // now as many as have score >= 1
+ r = LogLikelihood.compareFrequencies(w1, w2, 40, 1);
+
+ // only the boosted items should make the cut
+ assertEquals(3, r.size());
+ assertEquals(7, (int) r.get(0).item);
+ assertEquals(5, (int) r.get(1).item);
+ assertEquals(6, (int) r.get(2).item);
+
+ r = LogLikelihood.compareFrequencies(w1, w2, 1000, -100);
+ Multiset<Integer> k = HashMultiset.create();
+ for (LogLikelihood.ScoredItem<Integer> item : r) {
+ k.add(item.item);
+ }
+ for (int i = 0; i < 25; i++) {
+ assertTrue("i = " + i, k.count(i) == 1 || w2.count(i) == 0);
+ }
+
+ // all values that had non-zero counts in larger set should have result scores
+ assertEquals(w2.elementSet().size(), r.size());
+ assertEquals(7, (int) r.get(0).item);
+ assertEquals(5, (int) r.get(1).item);
+ assertEquals(6, (int) r.get(2).item);
+
+ // the last item should definitely have negative score
+ assertTrue(r.get(r.size() - 1).score < 0);
+
+ // make sure scores are descending
+ lastScore = r.get(0).score;
+ for (LogLikelihood.ScoredItem<Integer> item : r) {
+ assertTrue(item.score <= lastScore);
+ lastScore = item.score;
+ }
+ }
+
+ /**
+ * Samples from a multinomial distribution with parameters p and random generator rand.
+ * @param p A vector describing the distribution. Should sum to 1.
+ * @param rand A random number generator.
+ * @return A single sample from the multinomial distribution.
+ */
+ private int sample(Vector p, Random rand) {
+ double u = rand.nextDouble();
+
+ // simple sequential algorithm. Not the fastest, but we don't care
+ for (int i = 0; i < p.size(); i++) {
+ if (u <= p.get(i)) {
+ return i;
+ }
+ u -= p.get(i);
+ }
+ return p.size() - 1;
+ }
}