You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2012/04/02 13:03:47 UTC

svn commit: r1308298 - in /lucene/dev/branches/branch_3x/lucene/contrib: CHANGES.txt pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java pruning/src/test/org/apache/lucene/index/TestPruningReader.java

Author: ab
Date: Mon Apr  2 11:03:46 2012
New Revision: 1308298

URL: http://svn.apache.org/viewvc?rev=1308298&view=rev
Log:
LUCENE-3934 Residual IDF calculation in the pruning package was wrong,
correct formula required a slightly different implementation. Unit test added.

Modified:
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
    lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1308298&r1=1308297&r2=1308298&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Mon Apr  2 11:03:46 2012
@@ -177,6 +177,9 @@ Bug Fixes
    
  * LUCENE-3937: Workaround a XERCES-J bug in benchmark module.
    (Uwe Schindler, Robert Muir, Mike McCandless)
+
+ * LUCENE-3934: Residual IDF calculation in the pruning package is wrong
+   (Andrzej Bialecki)
   
 Documentation
 

Modified: lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java?rev=1308298&r1=1308297&r2=1308298&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java Mon Apr  2 11:03:46 2012
@@ -29,15 +29,27 @@ import org.apache.lucene.index.TermPosit
 
 /**
  * Implementation of {@link TermPruningPolicy} that uses "residual IDF"
- * metric to determine the postings of terms to keep/remove. Residual
- * IDF is a difference between a collection-wide IDF of a term and the
- * observed in-document frequency of the term.
+ * metric to determine the postings of terms to keep/remove, as defined in
+ * <a href="">http://www.dc.fi.udc.es/~barreiro/publications/blanco_barreiro_ecir2007.pdf</a>.
+ * <p>Residual IDF measures a difference between a collection-wide IDF of a term
+ * (which assumes a uniform distribution of occurrences) and the actual
+ * observed total number of occurrences of a term in all documents. Positive
+ * values indicate that a term is informative (e.g. for rare terms), negative
+ * values indicate that a term is not informative (e.g. too popular to offer
+ * good selectivity).
+ * <p>This metric produces small values close to [-1, 1], so useful ranges for
+ * thresholds under this metrics are somewhere between [0, 1]. The higher the
+ * threshold the more informative (and more rare) terms will be retained. For
+ * filtering of common words a value of close to or slightly below 0 (e.g. -0.1)
+ * should be a good starting point. 
+ * 
  */
 public class RIDFTermPruningPolicy extends TermPruningPolicy {
   double defThreshold;
   Map<String, Double> thresholds;
-  double df;
+  double idf;
   double maxDoc;
+  double ridf;
 
   public RIDFTermPruningPolicy(IndexReader in,
           Map<String, Integer> fieldFlags, Map<String, Double> thresholds,
@@ -54,7 +66,18 @@ public class RIDFTermPruningPolicy exten
 
   @Override
   public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
-    df = Math.log(in.docFreq(t) / maxDoc);
+    // from formula [2], not the formula [1]
+    // 
+    idf = - Math.log((double)in.docFreq(t) / maxDoc);
+    // calculate total number of occurrences
+    int totalFreq = 0;
+    while (tp.next()) {
+      totalFreq += tp.freq();
+    }
+    // reposition the enum
+    tp.seek(t);
+    // rest of the formula [2] in the paper
+    ridf = idf + Math.log(1 - Math.pow(Math.E,  - totalFreq / maxDoc));
   }
 
   @Override
@@ -65,7 +88,6 @@ public class RIDFTermPruningPolicy exten
   @Override
   public boolean pruneAllPositions(TermPositions termPositions, Term t)
           throws IOException {
-    double ridf = Math.log(1 - Math.pow(Math.E, termPositions.freq() / maxDoc)) - df;
     double thr = defThreshold;
     String key = t.field() + ":" + t.text();
     if (thresholds.containsKey(key)) {

Modified: lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java?rev=1308298&r1=1308297&r2=1308298&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java Mon Apr  2 11:03:46 2012
@@ -27,6 +27,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.index.PruningReader;
 import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy;
 import org.apache.lucene.index.pruning.PruningPolicy;
+import org.apache.lucene.index.pruning.RIDFTermPruningPolicy;
 import org.apache.lucene.index.pruning.StorePruningPolicy;
 import org.apache.lucene.index.pruning.TFTermPruningPolicy;
 import org.apache.lucene.search.IndexSearcher;
@@ -55,7 +56,8 @@ public class TestPruningReader extends L
     try {
       int i = 0;
       while(td.next()) {
-        assertEquals(t + ", i=" + i, ids[i], td.doc());
+        int doc = td.doc();
+        assertEquals(t + ", i=" + i, ids[i], doc);
         i++;
       }
       assertEquals(ids.length, i);
@@ -208,6 +210,20 @@ public class TestPruningReader extends L
     ir.deleteDocument(5);
     ir.close();
   }
+  
+  public void testRIDFPruning() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    IndexReader in = IndexReader.open(sourceDir, true);
+    // remove only very popular terms
+    RIDFTermPruningPolicy ridf = new RIDFTermPruningPolicy(in, null, null, -0.12);
+    PruningReader tfr = new PruningReader(in, null, ridf);
+    assertTDCount(tfr, new Term("body", "one"), 0);
+    assertTD(tfr, new Term("body", "two"), new int[]{0, 1, 2, 4});
+    assertTD(tfr, new Term("body", "three"), new int[]{0, 1, 3});
+    assertTD(tfr, new Term("test", "one"), new int[]{4});
+    assertTD(tfr, new Term("body", "four"), new int[]{0});
+    assertTD(tfr, new Term("test", "four"), new int[]{4});
+  }
 
   public void testTfPruning() throws Exception {
     RAMDirectory targetDir = new RAMDirectory();