You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2010/01/05 15:12:09 UTC
svn commit: r896053 - in /lucene/mahout/trunk:
math/src/main/java/org/apache/mahout/math/stats/
math/src/test/java/org/apache/mahout/math/stats/
utils/src/main/java/org/apache/mahout/utils/vectors/lucene/
Author: gsingers
Date: Tue Jan 5 14:12:09 2010
New Revision: 896053
URL: http://svn.apache.org/viewvc?rev=896053&view=rev
Log:
MAHOUT-163: slight refactoring to move log likelihood calculation into math and add tests
Added:
lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/
lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (with props)
lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/
lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (with props)
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=896053&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Tue Jan 5 14:12:09 2010
@@ -0,0 +1,69 @@
+package org.apache.mahout.math.stats;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Utility methods for working with log-likelihood
+ */
+public class LogLikelihood {
+  /**
+   * Calculate the unnormalized Shannon entropy <code>-sum(x * ln(x / N))</code>, where N is the sum of the counts.
+   * @param elements The event counts; none may be negative, and zero counts contribute nothing
+   * @return The entropy value for the elements, which is 0 when no count is positive
+   * @throws IllegalArgumentException if any count is negative
+   */
+  public static double entropy(int... elements) {
+    double sum = 0;
+    for (int element : elements) {
+      if (element < 0) {
+        throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + element + ")");
+      }
+      sum += element;
+    }
+    double result = 0.0;
+    for (int x : elements) {
+      if (x > 0) { // skip zeros: 0 * ln(0) counts as 0, and an all-zero input would otherwise yield NaN
+        result += x * Math.log(x / sum);
+      }
+    }
+    return -result;
+  }
+
+  /**
+   * Calculate the Log-likelihood ratio for two events, call them A and B. Then we have:
+   * <p/>
+   * <table border="1" cellpadding="5" cellspacing="0">
+   * <tbody><tr><td> </td><td>Event A</td><td>Everything but A</td></tr>
+   * <tr><td>Event B</td><td>A and B together (k_11)</td><td>B, but not A (k_12)</td></tr>
+   * <tr><td>Everything but B</td><td>A without B (k_21)</td><td>Neither A nor B (k_22)</td></tr></tbody>
+   * </table>
+   * Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions.
+   *
+   * @param k11 The number of times the two events occurred together
+   * @param k12 The number of times the second event occurred WITHOUT the first event
+   * @param k21 The number of times the first event occurred WITHOUT the second event
+   * @param k22 The number of times something else occurred (i.e. was neither of these events)
+   * @return The log-likelihood ratio, which is never negative
+   */
+  public static double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+    double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
+    double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
+    double matrixEntropy = entropy(k11, k12, k21, k22);
+    return Math.max(0.0, 2 * (matrixEntropy - rowEntropy - columnEntropy)); // clamp round-off that dips below 0
+  }
+}
Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=896053&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (added)
+++ lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Tue Jan 5 14:12:09 2010
@@ -0,0 +1,68 @@
+package org.apache.mahout.math.stats;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link LogLikelihood}, checked against the reference values
+ * recorded in the comment at the top of the class.
+ **/
+public class LogLikelihoodTest extends Assert {
+  /*
+   * Reference values (these look like R session output):
+   * > entropy(c(1,1))
+   * [1] 1.386294
+   * > llr(matrix(c(1,0,0,1), nrow=2))
+   * [1] 2.772589
+   * > llr(matrix(c(10,0,0,10), nrow=2))
+   * [1] 27.72589
+   * > llr(matrix(c(5,1995,0,100000), nrow=2))
+   * [1] 39.33052
+   * > llr(matrix(c(1000,1995,1000,100000), nrow=2))
+   * [1] 4730.737
+   * > llr(matrix(c(1000,1000,1000,100000), nrow=2))
+   * [1] 5734.343
+   * > llr(matrix(c(1000,1000,1000,99000), nrow=2))
+   * [1] 5714.932
+   */
+  @Test
+  public void testEntropy() throws Exception {
+    assertEquals(1.386294, LogLikelihood.entropy(1, 1), 0.0001);
+    assertEquals(0.0, LogLikelihood.entropy(5, 0), 0.0001); // a single certain outcome carries no entropy
+    //TODO: more tests here
+    try {
+      LogLikelihood.entropy(-1, -1);
+      fail("entropy() should reject negative counts");
+    } catch (IllegalArgumentException expected) {
+      // expected: negative counts are invalid input
+    }
+  }
+
+  @Test
+  public void testLogLikelihood() throws Exception {
+    //TODO: check the epsilons
+    assertEquals(2.772589, LogLikelihood.logLikelihoodRatio(1, 0, 0, 1), 0.0001);
+    assertEquals(27.72589, LogLikelihood.logLikelihoodRatio(10, 0, 0, 10), 0.0001);
+    assertEquals(39.33052, LogLikelihood.logLikelihoodRatio(5, 1995, 0, 100000), 0.0001);
+    assertEquals(4730.737, LogLikelihood.logLikelihoodRatio(1000, 1995, 1000, 100000), 0.001);
+    assertEquals(5734.343, LogLikelihood.logLikelihoodRatio(1000, 1000, 1000, 100000), 0.001);
+    assertEquals(5714.932, LogLikelihood.logLikelihoodRatio(1000, 1000, 1000, 99000), 0.001);
+  }
+
+}
Propchange: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=896053&r1=896052&r2=896053&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Tue Jan 5 14:12:09 2010
@@ -35,6 +35,7 @@
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.OpenBitSet;
import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.math.stats.LogLikelihood;
import org.apache.mahout.utils.clustering.ClusterDumper;
import org.apache.mahout.utils.vectors.TermEntry;
import org.slf4j.Logger;
@@ -282,35 +283,11 @@
int k12 = clusterSize - inDF;
int k22 = corpusSize - clusterSize - outDF;
- return logLikelihoodRatio(inDF, k12, outDF, k22);
+ return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
}
- private double entropy(int ... elements) {
- double sum = 0;
- for (int element : elements) {
- sum += element;
- }
- double result = 0.0;
- for (int x : elements) {
- if (x < 0) {
- throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + x + ")");
- }
- int zeroFlag = (x == 0 ? 1 : 0);
- result += x * Math.log((x + zeroFlag) / sum);
- }
- return -result;
- }
- /**
- * Calculate Log-likehood ratio for the given matrix.
- */
- private double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
- double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
- double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
- double matrixEntropy = entropy(k11, k12, k21, k22);
- return 2 * (matrixEntropy - rowEntropy - columnEntropy);
- }
public String getIdField() {
return idField;