Posted to commits@lucene.apache.org by ds...@apache.org on 2015/09/13 05:54:47 UTC

svn commit: r1702695 - in /lucene/dev/trunk/lucene: CHANGES.txt highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java

Author: dsmiley
Date: Sun Sep 13 03:54:46 2015
New Revision: 1702695

URL: http://svn.apache.org/r1702695
Log:
LUCENE-5503: Highlighter WSTE didn't always convert a PhraseQuery to a SpanQuery correctly.

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
    lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1702695&r1=1702694&r2=1702695&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Sep 13 03:54:46 2015
@@ -126,6 +126,11 @@ Bug Fixes
 * LUCENE-6792: Fix TermsQuery.toString() to work with binary terms.
   (Ruslan Muzhikov, Robert Muir)
 
+* LUCENE-5503: When Highlighter's WeightedSpanTermExtractor converts a
+  PhraseQuery to an equivalent SpanQuery, it could use a slop that was too
+  low (yielding no highlight) or compute inOrder incorrectly.
+  (Tim Allison via David Smiley)
+
 Other
 
 * LUCENE-6174: Improve "ant eclipse" to select right JRE for building.
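
For context on the entry above: the conversion turns a PhraseQuery whose terms may sit at
non-adjacent positions (e.g. after stop-word removal) into a SpanNearQuery whose slop has to
absorb every skipped position, not just the largest single increment. The sketch below is
illustrative only and is not part of this commit; the field name "body" and the terms are
invented, and it assumes the PhraseQuery.Builder and span query APIs on trunk at this time.

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.PhraseQuery;
    import org.apache.lucene.search.spans.SpanNearQuery;
    import org.apache.lucene.search.spans.SpanQuery;
    import org.apache.lucene.search.spans.SpanTermQuery;

    // Phrase left with gaps by an analyzer that drops stop words:
    // "ab [the the] cd [the the the] ef"  ->  positions 0, 3, 7
    PhraseQuery phrase = new PhraseQuery.Builder()
        .add(new Term("body", "ab"), 0)
        .add(new Term("body", "cd"), 3)
        .add(new Term("body", "ef"), 7)
        .build();

    // An equivalent SpanNearQuery needs slop >= 5 (2 skipped positions between ab/cd
    // plus 3 between cd/ef). The old logic added only the largest single increment (4),
    // one short of what is required, so the highlighter found no match.
    SpanQuery[] clauses = {
        new SpanTermQuery(new Term("body", "ab")),
        new SpanTermQuery(new Term("body", "cd")),
        new SpanTermQuery(new Term("body", "ef"))
    };
    SpanNearQuery equivalent =
        new SpanNearQuery(clauses, phrase.getSlop() + 5, phrase.getSlop() == 0);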

Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1702695&r1=1702694&r2=1702695&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Sun Sep 13 03:54:46 2015
@@ -17,6 +17,16 @@ package org.apache.lucene.search.highlig
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.BinaryDocValues;
@@ -59,17 +69,6 @@ import org.apache.lucene.search.spans.Sp
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.IOUtils;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-
 /**
  * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether 
  * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
@@ -121,33 +120,19 @@ public class WeightedSpanTermExtractor {
       for (int i = 0; i < phraseQueryTerms.length; i++) {
         clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
       }
-      int slop = phraseQuery.getSlop();
+
+      // sum the position increments beyond 1 (the gaps left between the phrase's terms)
+      int positionGaps = 0;
       int[] positions = phraseQuery.getPositions();
-      // add largest position increment to slop
-      if (positions.length > 0) {
-        int lastPos = positions[0];
-        int largestInc = 0;
-        int sz = positions.length;
-        for (int i = 1; i < sz; i++) {
-          int pos = positions[i];
-          int inc = pos - lastPos;
-          if (inc > largestInc) {
-            largestInc = inc;
-          }
-          lastPos = pos;
-        }
-        if(largestInc > 1) {
-          slop += largestInc;
-        }
+      if (positions.length >= 2) {
+        // positions are in increasing order; max(0, ...) is just a safeguard.
+        positionGaps = Math.max(0, positions[positions.length-1] - positions[0] - positions.length + 1);
       }
 
-      boolean inorder = false;
-
-      if (slop == 0) {
-        inorder = true;
-      }
+      // if the original slop is 0 then require inOrder
+      boolean inorder = (phraseQuery.getSlop() == 0);
 
-      SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
+      SpanNearQuery sp = new SpanNearQuery(clauses, phraseQuery.getSlop() + positionGaps, inorder);
       extractWeightedSpanTerms(terms, sp, boost);
     } else if (query instanceof TermQuery) {
       extractWeightedTerms(terms, query, boost);
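
As a hand-check of the new positionGaps arithmetic above (illustrative only; the positions
match the ones the stop-word test added below uses):

    // positions of the surviving terms "ab", "cd", "ef": {0, 3, 7}
    int[] positions = {0, 3, 7};
    // last - first - (number of terms - 1)
    int positionGaps = Math.max(0, positions[positions.length - 1] - positions[0] - positions.length + 1);
    // = 7 - 0 - 3 + 1 = 5: two skipped positions between ab and cd, three between cd and ef.
    // The removed loop added only the largest single increment (4), one short of the 5 needed,
    // so the resulting SpanNearQuery could not match and nothing was highlighted.
    // Separately, inOrder is now derived from the user's original slop (getSlop() == 0) rather
    // than the gap-padded slop, so a zero-slop phrase still requires its terms in order.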

Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java?rev=1702695&r1=1702694&r2=1702695&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java Sun Sep 13 03:54:46 2015
@@ -20,10 +20,12 @@ package org.apache.lucene.search.highlig
 import java.io.IOException;
 
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
@@ -267,6 +269,76 @@ public class HighlighterPhraseTest exten
       directory.close();
     }
   }
+
+  //shows the need to sum the increments in WeightedSpanTermExtractor
+  public void testStopWords() throws IOException, InvalidTokenOffsetsException {
+    MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+        MockTokenFilter.ENGLISH_STOPSET);    
+    final String TEXT = "the ab the the cd the the the ef the";
+    final Directory directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(stopAnalyzer))) {
+      final Document document = new Document();
+      document.add(newTextField(FIELD, TEXT, Store.YES));
+      indexWriter.addDocument(document);
+    }
+    try (IndexReader indexReader = DirectoryReader.open(directory)) {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = newSearcher(indexReader);
+      //equivalent of "ab the the cd the the the ef"
+      final PhraseQuery phraseQuery = new PhraseQuery.Builder()
+          .add(new Term(FIELD, "ab"), 0)
+          .add(new Term(FIELD, "cd"), 3)
+          .add(new Term(FIELD, "ef"), 7).build();
+
+      TopDocs hits = indexSearcher.search(phraseQuery, 100);
+      assertEquals(1, hits.totalHits);
+      final Highlighter highlighter = new Highlighter(
+          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+          new QueryScorer(phraseQuery));
+      assertEquals(1, highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10).length);
+    } finally {
+      directory.close();
+    }
+  }
+  
+  //shows the need to require inOrder if getSlop() == 0, not if final slop == 0 
+  //in WeightedSpanTermExtractor
+  public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsException {
+    MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
+        MockTokenFilter.ENGLISH_STOPSET);        
+    final String TEXT = "the cd the ab the the the the the the the ab the cd the";
+    final Directory directory = newDirectory();
+    try (IndexWriter indexWriter = new IndexWriter(directory,
+        newIndexWriterConfig(stopAnalyzer))) {
+      final Document document = new Document();
+      document.add(newTextField(FIELD, TEXT, Store.YES));
+      indexWriter.addDocument(document);
+    }
+    try (IndexReader indexReader = DirectoryReader.open(directory)) {
+      assertEquals(1, indexReader.numDocs());
+      final IndexSearcher indexSearcher = newSearcher(indexReader);
+      //equivalent of "ab the cd"
+      final PhraseQuery phraseQuery = new PhraseQuery.Builder()
+          .add(new Term(FIELD, "ab"), 0)
+          .add(new Term(FIELD, "cd"), 2).build();
+
+      TopDocs hits = indexSearcher.search(phraseQuery, 100);
+      assertEquals(1, hits.totalHits);
+
+      final Highlighter highlighter = new Highlighter(
+          new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
+          new QueryScorer(phraseQuery));
+      String[] frags = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10);
+      assertEquals(1, frags.length);
+      assertTrue("contains <B>ab</B> the <B>cd</B>",
+          (frags[0].contains("<B>ab</B> the <B>cd</B>")));
+      assertTrue("does not contain <B>cd</B> the <B>ab</B>",
+          (!frags[0].contains("<B>cd</B> the <B>ab</B>")));
+    } finally {
+      directory.close();
+    }
+  }
 
   private static final class TokenStreamSparse extends TokenStream {
     private Token[] tokens;