You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2010/01/05 15:12:09 UTC

svn commit: r896053 - in /lucene/mahout/trunk: math/src/main/java/org/apache/mahout/math/stats/ math/src/test/java/org/apache/mahout/math/stats/ utils/src/main/java/org/apache/mahout/utils/vectors/lucene/

Author: gsingers
Date: Tue Jan  5 14:12:09 2010
New Revision: 896053

URL: http://svn.apache.org/viewvc?rev=896053&view=rev
Log:
MAHOUT-163: slight refactoring to move log likelihood calculation into math and add tests

Added:
    lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/
    lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java   (with props)
    lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/
    lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java   (with props)
Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java

Added: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=896053&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (added)
+++ lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Tue Jan  5 14:12:09 2010
@@ -0,0 +1,69 @@
+package org.apache.mahout.math.stats;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * Utility methods for working with log-likelihood.
+ */
+public class LogLikelihood {
+  /**
+   * Calculate the unnormalized Shannon entropy of a set of counts: -sum(x * ln(x / N)), where N is the sum of the counts.
+   * @param elements The non-negative counts; a zero count contributes nothing, so all-zero (or empty) input yields 0
+   * @return The entropy value for the elements
+   * @throws IllegalArgumentException if any element is negative
+   */
+  public static double entropy(int... elements) {
+    double sum = 0;
+    for (int element : elements) {
+      sum += element;
+    }
+    double result = 0.0;
+    for (int x : elements) {
+      if (x < 0) {
+        throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + x + ")");
+      }
+      result += (x == 0) ? 0.0 : x * Math.log(x / sum); // 0 * ln(0) is taken as 0; avoids 0 * Infinity = NaN when sum is 0
+    }
+    return -result;
+  }
+
+  /**
+   * Calculate the Log-likelihood ratio for two events, call them A and B.  Then we have:
+   * <p/>
+   * <table border="1" cellpadding="5" cellspacing="0">
+   * <tbody><tr><td>&nbsp;</td><td>Event A</td><td>Everything but A</td></tr>
+   * <tr><td>Event B</td><td>A and B together (k_11)</td><td>B, but not A (k_12)</td></tr>
+   * <tr><td>Everything but B</td><td>A without B (k_21)</td><td>Neither A nor B (k_22)</td></tr></tbody>
+   * </table>
+   * <p/>
+   * Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions.
+   *
+   * @param k11 The number of times the two events occurred together
+   * @param k12 The number of times the second event occurred WITHOUT the first event
+   * @param k21 The number of times the first event occurred WITHOUT the second event
+   * @param k22 The number of times something else occurred (i.e. was neither of these events)
+   * @return The log-likelihood ratio
+   * @throws IllegalArgumentException if any count is negative
+   */
+  public static double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+    double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
+    double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
+    double matrixEntropy = entropy(k11, k12, k21, k22);
+    return 2 * (matrixEntropy - rowEntropy - columnEntropy);
+  }
+}

Propchange: lucene/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java?rev=896053&view=auto
==============================================================================
--- lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java (added)
+++ lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java Tue Jan  5 14:12:09 2010
@@ -0,0 +1,68 @@
+package org.apache.mahout.math.stats;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link LogLikelihood}.
+ * Expected values are the reference results recorded in the comment inside the class.
+ **/
+public class LogLikelihoodTest extends Assert{
+  /*
+   * Reference session the expected values below were taken from:
+   * > entropy(c(1,1))
+   * [1] 1.386294
+   * llr(matrix(c(1,0,0,1), nrow=2))
+   * [1] 2.772589
+   * llr(matrix(c(10,0,0,10), nrow=2))
+   * [1] 27.72589
+   * llr(matrix(c(5,1995,0,100000), nrow=2))
+   * [1] 39.33052
+   * llr(matrix(c(1000,1995,1000,100000), nrow=2))
+   * [1] 4730.737
+   * llr(matrix(c(1000,1000,1000,100000), nrow=2))
+   * [1] 5734.343
+   * llr(matrix(c(1000,1000,1000,99000), nrow=2))
+   * [1] 5714.932
+   */
+  @Test
+  public void testEntropy() throws Exception {
+    // assertEquals takes (expected, actual, delta); keeping that order makes failure messages read correctly
+    assertEquals(1.386294, LogLikelihood.entropy(1, 1), 0.0001);
+    //TODO: more tests here
+    try {
+      LogLikelihood.entropy(-1, -1);//exception
+      fail("Expected an IllegalArgumentException for negative counts");
+    } catch (IllegalArgumentException e) {
+      // expected: negative counts must be rejected
+    }
+  }
+
+  @Test
+  public void testLogLikelihood() throws Exception {
+    //TODO: check the epsilons
+    assertEquals(2.772589, LogLikelihood.logLikelihoodRatio(1,0,0,1), 0.0001);
+    assertEquals(27.72589, LogLikelihood.logLikelihoodRatio(10,0,0,10), 0.0001);
+    assertEquals(39.33052, LogLikelihood.logLikelihoodRatio(5,1995,0,100000), 0.0001);
+    assertEquals(4730.737, LogLikelihood.logLikelihoodRatio(1000,1995, 1000, 100000), 0.001);
+    assertEquals(5734.343, LogLikelihood.logLikelihoodRatio(1000,1000,1000, 100000), 0.001);
+    assertEquals(5714.932, LogLikelihood.logLikelihoodRatio(1000,1000,1000, 99000), 0.001);
+  }
+
+}

Propchange: lucene/mahout/trunk/math/src/test/java/org/apache/mahout/math/stats/LogLikelihoodTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=896053&r1=896052&r2=896053&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Tue Jan  5 14:12:09 2010
@@ -35,6 +35,7 @@
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.math.stats.LogLikelihood;
 import org.apache.mahout.utils.clustering.ClusterDumper;
 import org.apache.mahout.utils.vectors.TermEntry;
 import org.slf4j.Logger;
@@ -282,35 +283,11 @@
     int k12 = clusterSize - inDF;
     int k22 = corpusSize - clusterSize - outDF;
 
-    return logLikelihoodRatio(inDF, k12, outDF, k22);
+    return LogLikelihood.logLikelihoodRatio(inDF, k12, outDF, k22);
   }
 
 
-  private double entropy(int ... elements) {
-    double sum = 0;
-    for (int element : elements) {
-      sum += element;
-    }
-    double result = 0.0;
-    for (int x : elements) {
-      if (x < 0) {
-        throw new IllegalArgumentException("Should not have negative count for entropy computation: (" + x + ")");
-      }
-      int zeroFlag = (x == 0 ? 1 : 0);
-      result += x * Math.log((x + zeroFlag) / sum);
-    }
-    return -result;
-  }
 
-  /**
-   * Calculate Log-likehood ratio for the given matrix.
-   */
-  private double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
-    double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
-    double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
-    double matrixEntropy = entropy(k11, k12, k21, k22);
-    return 2 * (matrixEntropy - rowEntropy - columnEntropy);
-  }
 
   public String getIdField() {
     return idField;