You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/02/26 16:10:20 UTC

svn commit: r1450206 - in /lucene/dev/trunk/lucene: ./ highlighter/src/java/org/apache/lucene/search/postingshighlight/ highlighter/src/test/org/apache/lucene/search/postingshighlight/

Author: rmuir
Date: Tue Feb 26 15:10:20 2013
New Revision: 1450206

URL: http://svn.apache.org/r1450206
Log:
LUCENE-4798: PostingsHighlighter's formatter sometimes doesn't highlight matched terms

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
    lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
    lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
    lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1450206&r1=1450205&r2=1450206&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Feb 26 15:10:20 2013
@@ -264,6 +264,8 @@ Bug Fixes
   large performance impacts for many non-random or non-uniform
   term distributions.  (John Wang, yonik)
 
+* LUCENE-4798: PostingsHighlighter's formatter sometimes didn't highlight 
+  matched terms.  (Robert Muir)
 
 Documentation
 

Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java?rev=1450206&r1=1450205&r2=1450206&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java Tue Feb 26 15:10:20 2013
@@ -20,6 +20,7 @@ package org.apache.lucene.search.posting
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.SorterTemplate;
 
 /**
  * Represents a passage (typically a sentence of the document). 
@@ -53,6 +54,45 @@ public final class Passage {
     numMatches++;
   }
   
+  void sort() { // reorders this passage's match arrays by comparing match start values
+    final int starts[] = matchStarts;
+    final int ends[] = matchEnds;
+    final Term terms[] = matchTerms;
+    new SorterTemplate() { // anonymous sorter working directly on the three arrays above
+      @Override
+      protected void swap(int i, int j) {
+        int temp = starts[i];
+        starts[i] = starts[j];
+        starts[j] = temp;
+        // the arrays are parallel: the end at i/j is exchanged along with the start
+        temp = ends[i];
+        ends[i] = ends[j];
+        ends[j] = temp;
+        // ...and so is the term, so each (start, end, term) triple stays together
+        Term tempTerm = terms[i];
+        terms[i] = terms[j];
+        terms[j] = tempTerm;
+      }
+
+      @Override
+      protected int compare(int i, int j) {
+        return Integer.compare(starts[i], starts[j]); // ordering key is the start value only
+      }
+
+      @Override
+      protected void setPivot(int i) {
+        pivot = starts[i];
+      }
+
+      @Override
+      protected int comparePivot(int j) {
+        return Integer.compare(pivot, starts[j]); // same key as compare(), against the saved pivot
+      }
+      // start value stored by setPivot() and read by comparePivot()
+      int pivot;
+    }.mergeSort(0, numMatches-1); // bounds passed are 0 and numMatches-1
+  }
+  
   void reset() {
     startOffset = endOffset = -1;
     score = 0.0f;

Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1450206&r1=1450205&r2=1450206&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Tue Feb 26 15:10:20 2013
@@ -399,6 +399,9 @@ public final class PostingsHighlighter {
         if (start >= contentLength) {
           Passage passages[] = new Passage[passageQueue.size()];
           passageQueue.toArray(passages);
+          for (Passage p : passages) {
+            p.sort();
+          }
           // sort in ascending order
           Arrays.sort(passages, new Comparator<Passage>() {
             @Override

Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1450206&r1=1450205&r2=1450206&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Tue Feb 26 15:10:20 2013
@@ -19,6 +19,7 @@ package org.apache.lucene.search.posting
 
 import java.util.Map;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
@@ -34,6 +35,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
@@ -272,4 +274,40 @@ public class TestPostingsHighlighter ext
     ir.close();
     dir.close();
   }
+  
+  public void testBuddhism() throws Exception { // phrase query: both adjacent terms must be wrapped in <b> tags
+    String text = "This eight-volume set brings together seminal papers in Buddhist studies from a vast " +
+    		          "range of academic disciplines published over the last forty years. With a new introduction " + 
+                  "by the editor, this collection is a unique and unrivalled research resource for both " + 
+    		          "student and scholar. Coverage includes: - Buddhist origins; early history of Buddhism in " + 
+                  "South and Southeast Asia - early Buddhist Schools and Doctrinal History; Theravada Doctrine " + 
+    		          "- the Origins and nature of Mahayana Buddhism; some Mahayana religious topics - Abhidharma " + 
+                  "and Madhyamaka - Yogacara, the Epistemological tradition, and Tathagatagarbha - Tantric " + 
+    		          "Buddhism (Including China and Japan); Buddhism in Nepal and Tibet - Buddhism in South and " + 
+                  "Southeast Asia, and - Buddhism in China, East Asia, and Japan.";
+    Directory dir = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer);
+    // index the text as one stored "body" field with docs, freqs, positions and offsets
+    FieldType positionsType = new FieldType(TextField.TYPE_STORED);
+    positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", text, positionsType);
+    Document document = new Document();
+    document.add(body);
+    iw.addDocument(document);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    IndexSearcher searcher = newSearcher(ir);
+    PhraseQuery query = new PhraseQuery(); // phrase "buddhist origins" on the body field
+    query.add(new Term("body", "buddhist"));
+    query.add(new Term("body", "origins"));
+    TopDocs topDocs = searcher.search(query, 10);
+    assertEquals(1, topDocs.totalHits); // only one document was indexed
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); // NOTE(review): 2 is presumably the max passages per doc - confirm
+    assertEquals(1, snippets.length); // one snippet for the single hit
+    assertTrue(snippets[0].contains("<b>Buddhist</b> <b>origins</b>")); // both phrase terms highlighted side by side
+    ir.close();
+    dir.close();
+  }
 }

Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java?rev=1450206&r1=1450205&r2=1450206&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java Tue Feb 26 15:10:20 2013
@@ -166,6 +166,7 @@ public class TestPostingsHighlighterRank
         assertTrue(p.getStartOffset() >= 0);
         assertTrue(p.getStartOffset() <= content.length());
         // we use a very simple analyzer. so we can assert the matches are correct
+        int lastMatchStart = -1;
         for (int i = 0; i < p.getNumMatches(); i++) {
           Term term = p.getMatchTerms()[i];
           assertEquals("body", term.field());
@@ -173,6 +174,9 @@ public class TestPostingsHighlighterRank
           assertTrue(matchStart >= 0);
           int matchEnd = p.getMatchEnds()[i];
           assertTrue(matchEnd >= 0);
+          // always moving forward
+          assertTrue(matchStart >= lastMatchStart);
+          lastMatchStart = matchStart;
           // single character terms
           assertEquals(matchStart+1, matchEnd);
           // and the offsets must be correct...