You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2012/04/02 13:03:47 UTC
svn commit: r1308298 - in /lucene/dev/branches/branch_3x/lucene/contrib:
CHANGES.txt
pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
pruning/src/test/org/apache/lucene/index/TestPruningReader.java
Author: ab
Date: Mon Apr 2 11:03:46 2012
New Revision: 1308298
URL: http://svn.apache.org/viewvc?rev=1308298&view=rev
Log:
LUCENE-3934 Residual IDF calculation in the pruning package was wrong,
correct formula required a slightly different implementation. Unit test added.
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1308298&r1=1308297&r2=1308298&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Mon Apr 2 11:03:46 2012
@@ -177,6 +177,9 @@ Bug Fixes
* LUCENE-3937: Workaround a XERCES-J bug in benchmark module.
(Uwe Schindler, Robert Muir, Mike McCandless)
+
+ * LUCENE-3934: Residual IDF calculation in the pruning package is wrong
+ (Andrzej Bialecki)
Documentation
Modified: lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java?rev=1308298&r1=1308297&r2=1308298&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java Mon Apr 2 11:03:46 2012
@@ -29,15 +29,27 @@ import org.apache.lucene.index.TermPosit
/**
* Implementation of {@link TermPruningPolicy} that uses "residual IDF"
- * metric to determine the postings of terms to keep/remove. Residual
- * IDF is a difference between a collection-wide IDF of a term and the
- * observed in-document frequency of the term.
+ * metric to determine the postings of terms to keep/remove, as defined in
+ * <a href="">http://www.dc.fi.udc.es/~barreiro/publications/blanco_barreiro_ecir2007.pdf</a>.
+ * <p>Residual IDF measures a difference between a collection-wide IDF of a term
+ * (which assumes a uniform distribution of occurrences) and the actual
+ * observed total number of occurrences of a term in all documents. Positive
+ * values indicate that a term is informative (e.g. for rare terms), negative
+ * values indicate that a term is not informative (e.g. too popular to offer
+ * good selectivity).
+ * <p>This metric produces small values close to [-1, 1], so useful ranges for
+ * thresholds under this metrics are somewhere between [0, 1]. The higher the
+ * threshold the more informative (and more rare) terms will be retained. For
+ * filtering of common words a value of close to or slightly below 0 (e.g. -0.1)
+ * should be a good starting point.
+ *
*/
public class RIDFTermPruningPolicy extends TermPruningPolicy {
double defThreshold;
Map<String, Double> thresholds;
- double df;
+ double idf;
double maxDoc;
+ double ridf;
public RIDFTermPruningPolicy(IndexReader in,
Map<String, Integer> fieldFlags, Map<String, Double> thresholds,
@@ -54,7 +66,18 @@ public class RIDFTermPruningPolicy exten
@Override
public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
- df = Math.log(in.docFreq(t) / maxDoc);
+ // from formula [2], not the formula [1]
+ //
+ idf = - Math.log((double)in.docFreq(t) / maxDoc);
+ // calculate total number of occurrences
+ int totalFreq = 0;
+ while (tp.next()) {
+ totalFreq += tp.freq();
+ }
+ // reposition the enum
+ tp.seek(t);
+ // rest of the formula [2] in the paper
+ ridf = idf + Math.log(1 - Math.pow(Math.E, - totalFreq / maxDoc));
}
@Override
@@ -65,7 +88,6 @@ public class RIDFTermPruningPolicy exten
@Override
public boolean pruneAllPositions(TermPositions termPositions, Term t)
throws IOException {
- double ridf = Math.log(1 - Math.pow(Math.E, termPositions.freq() / maxDoc)) - df;
double thr = defThreshold;
String key = t.field() + ":" + t.text();
if (thresholds.containsKey(key)) {
Modified: lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java?rev=1308298&r1=1308297&r2=1308298&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java Mon Apr 2 11:03:46 2012
@@ -27,6 +27,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.PruningReader;
import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy;
import org.apache.lucene.index.pruning.PruningPolicy;
+import org.apache.lucene.index.pruning.RIDFTermPruningPolicy;
import org.apache.lucene.index.pruning.StorePruningPolicy;
import org.apache.lucene.index.pruning.TFTermPruningPolicy;
import org.apache.lucene.search.IndexSearcher;
@@ -55,7 +56,8 @@ public class TestPruningReader extends L
try {
int i = 0;
while(td.next()) {
- assertEquals(t + ", i=" + i, ids[i], td.doc());
+ int doc = td.doc();
+ assertEquals(t + ", i=" + i, ids[i], doc);
i++;
}
assertEquals(ids.length, i);
@@ -208,6 +210,20 @@ public class TestPruningReader extends L
ir.deleteDocument(5);
ir.close();
}
+
+ public void testRIDFPruning() throws Exception {
+ RAMDirectory targetDir = new RAMDirectory();
+ IndexReader in = IndexReader.open(sourceDir, true);
+ // remove only very popular terms
+ RIDFTermPruningPolicy ridf = new RIDFTermPruningPolicy(in, null, null, -0.12);
+ PruningReader tfr = new PruningReader(in, null, ridf);
+ assertTDCount(tfr, new Term("body", "one"), 0);
+ assertTD(tfr, new Term("body", "two"), new int[]{0, 1, 2, 4});
+ assertTD(tfr, new Term("body", "three"), new int[]{0, 1, 3});
+ assertTD(tfr, new Term("test", "one"), new int[]{4});
+ assertTD(tfr, new Term("body", "four"), new int[]{0});
+ assertTD(tfr, new Term("test", "four"), new int[]{4});
+ }
public void testTfPruning() throws Exception {
RAMDirectory targetDir = new RAMDirectory();