You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/09/11 07:42:52 UTC

svn commit: r996075 - /mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java

Author: tdunning
Date: Sat Sep 11 05:42:51 2010
New Revision: 996075

URL: http://svn.apache.org/viewvc?rev=996075&view=rev
Log:
Added comments for LogLikelihood to explain un-normalized entropy.

Modified:
    mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java

Modified: mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java?rev=996075&r1=996074&r2=996075&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/stats/LogLikelihood.java Sat Sep 11 05:42:51 2010
@@ -26,7 +26,16 @@ public final class LogLikelihood {
   }
 
   /**
-   * Calculate the Shannon entropy.
+   * Calculate the unnormalized Shannon entropy.  This is
+   *
+   * -sum_i x_i log(x_i / N) = -N sum_i (x_i/N) log(x_i/N)
+   *
+   * where N = sum x_i
+   *
+   * If the x_i sum to 1 (i.e. they are already probabilities), this
+   * reduces to the standard Shannon entropy.  Leaving the result
+   * unnormalized makes working with raw counts and computing the LLR easier.
+   *
    * @return The entropy value for the elements
    */
   public static double entropy(int... elements) {
@@ -64,6 +73,7 @@ public final class LogLikelihood {
    * Credit to http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html for the table and the descriptions.
    */
   public static double logLikelihoodRatio(int k11, int k12, int k21, int k22) {
+    // note that we have counts here, not probabilities, and that the entropy is not normalized.
     double rowEntropy = entropy(k11, k12) + entropy(k21, k22);
     double columnEntropy = entropy(k11, k21) + entropy(k12, k22);
     double matrixEntropy = entropy(k11, k12, k21, k22);
@@ -75,7 +85,7 @@ public final class LogLikelihood {
   }
   
   /** 
-   * Calculate the Root Log-likelihood ratio for two events.
+   * Calculate the root log-likelihood ratio for two events.
    * See {@link #logLikelihoodRatio(int, int, int, int)}.
 
    * @param k11 The number of times the two events occurred together