You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/08/06 16:55:41 UTC
svn commit: r1369859 - in /lucene/dev/trunk/lucene: ./ highlighter/src/java/org/apache/lucene/search/highlight/ highlighter/src/java/org/apache/lucene/search/vectorhighlight/
Author: rmuir
Date: Mon Aug 6 14:55:41 2012
New Revision: 1369859
URL: http://svn.apache.org/viewvc?rev=1369859&view=rev
Log:
LUCENE-4289: fix highlighter idf inconsistencies/inefficiencies
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1369859&r1=1369858&r2=1369859&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Aug 6 14:55:41 2012
@@ -169,6 +169,9 @@ Bug Fixes
* LUCENE-4282: Automaton FuzzyQuery didnt always deliver all results.
(Johannes Christen, Uwe Schindler, Robert Muir)
+* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter.
+ (Robert Muir)
+
Changes in Runtime Behavior
* LUCENE-4109: Enable position increments in the flexible queryparser by default.
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java?rev=1369859&r1=1369858&r2=1369859&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java Mon Aug 6 14:55:41 2012
@@ -60,18 +60,14 @@ public final class QueryTermExtractor
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{
WeightedTerm[] terms=getTerms(query,false, fieldName);
- int totalNumDocs=reader.numDocs();
+ int totalNumDocs=reader.maxDoc();
for (int i = 0; i < terms.length; i++)
{
try
{
int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
- // docFreq counts deletes
- if(totalNumDocs < docFreq) {
- docFreq = totalNumDocs;
- }
//IDF algorithm taken from DefaultSimilarity class
- float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
+ float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
terms[i].weight*=idf;
}
catch (IOException e)
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1369859&r1=1369858&r2=1369859&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Mon Aug 6 14:55:41 2012
@@ -431,7 +431,7 @@ public class WeightedSpanTermExtractor {
Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<String>();
extract(query, terms);
- int totalNumDocs = reader.numDocs();
+ int totalNumDocs = reader.maxDoc();
Set<String> weightedTerms = terms.keySet();
Iterator<String> it = weightedTerms.iterator();
@@ -439,12 +439,8 @@ public class WeightedSpanTermExtractor {
while (it.hasNext()) {
WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
- // docFreq counts deletes
- if(totalNumDocs < docFreq) {
- docFreq = totalNumDocs;
- }
// IDF algorithm taken from DefaultSimilarity class
- float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
+ float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
weightedSpanTerm.weight *= idf;
}
} finally {
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1369859&r1=1369858&r2=1369859&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Mon Aug 6 14:55:41 2012
@@ -95,8 +95,7 @@ public class FieldTermStack {
DocsAndPositionsEnum dpEnum = null;
BytesRef text;
- int numDocs = reader.numDocs() - reader.numDeletedDocs();
- float weight = 0;
+ int numDocs = reader.maxDoc();
while ((text = termsEnum.next()) != null) {
UnicodeUtil.UTF8toUTF16(text, spare);
@@ -111,13 +110,14 @@ public class FieldTermStack {
}
dpEnum.nextDoc();
+
+ // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
+ final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
final int freq = dpEnum.freq();
for(int i = 0;i < freq;i++) {
int pos = dpEnum.nextPosition();
- // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
- weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( fieldName, text ) + 1 ) ) + 1.0 );
if (dpEnum.startOffset() < 0) {
return; // no offsets, null snippet
}