You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/19 16:38:16 UTC

svn commit: r1579264 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/

Author: rmuir
Date: Wed Mar 19 15:38:15 2014
New Revision: 1579264

URL: http://svn.apache.org/r1579264
Log:
LUCENE-5538: FastVectorHighlighter fails with booleans of phrases

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1579264&r1=1579263&r2=1579264&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Mar 19 15:38:15 2014
@@ -164,6 +164,9 @@ Bug fixes
 
 * LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order.  (Robert Muir)
 
+* LUCENE-5538: Fix FastVectorHighlighter bug with index-time synonyms when the
+  query is more complex than a single phrase.  (Robert Muir)
+
 Test Framework
 
 * LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1579264&r1=1579263&r2=1579264&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Wed Mar 19 15:38:15 2014
@@ -72,8 +72,15 @@ public class FieldPhraseList {
     {      
       phraseCandidate.clear();
 
-      TermInfo ti = fieldTermStack.pop();
+      TermInfo ti = null;
+      TermInfo first = null;
+      
+      first = ti = fieldTermStack.pop();
       currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+      while (currMap == null && ti.getNext() != first) {
+        ti = ti.getNext();
+        currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+      }
 
       // if not found, discard top TermInfo from stack, then try next element
       if( currMap == null ) continue;
@@ -81,10 +88,15 @@ public class FieldPhraseList {
       // if found, search the longest phrase
       phraseCandidate.add( ti );
       while( true ){
-        ti = fieldTermStack.pop();
+        first = ti = fieldTermStack.pop();
         nextMap = null;
-        if( ti != null )
+        if( ti != null ) {
           nextMap = currMap.getTermMap( ti.getText() );
+          while (nextMap == null && ti.getNext() != first) {
+            ti = ti.getNext();
+            nextMap = currMap.getTermMap( ti.getText() );
+          }
+        }
         if( ti == null || nextMap == null ){
           if( ti != null ) 
             fieldTermStack.push( ti );

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1579264&r1=1579263&r2=1579264&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Wed Mar 19 15:38:15 2014
@@ -18,6 +18,7 @@ package org.apache.lucene.search.vectorh
 
 import java.io.IOException;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.Set;
 
@@ -128,6 +129,30 @@ public class FieldTermStack {
     
     // sort by position
     Collections.sort(termList);
+    
+    // now look for dups at the same position, linking them together
+    int currentPos = -1;
+    TermInfo previous = null;
+    TermInfo first = null;
+    Iterator<TermInfo> iterator = termList.iterator();
+    while (iterator.hasNext()) {
+      TermInfo current = iterator.next();
+      if (current.position == currentPos) {
+        assert previous != null;
+        previous.setNext(current);
+        previous = current;
+        iterator.remove();
+      } else {
+        if (previous != null) {
+          previous.setNext(first);
+        }
+        previous = first = current;
+        currentPos = current.position;
+      }
+    }
+    if (previous != null) {
+      previous.setNext(first);
+    }
   }
 
   /**
@@ -173,6 +198,10 @@ public class FieldTermStack {
 
     // IDF-weight of this term
     private final float weight;
+    
+    // pointer to other TermInfo's at the same position.
+    // this is a circular list, so with no syns, just points to itself
+    private TermInfo next;
 
     public TermInfo( String text, int startOffset, int endOffset, int position, float weight ){
       this.text = text;
@@ -180,8 +209,15 @@ public class FieldTermStack {
       this.endOffset = endOffset;
       this.position = position;
       this.weight = weight;
+      this.next = this;
     }
     
+    void setNext(TermInfo next) { this.next = next; }
+    /** 
+     * Returns the next TermInfo at this same position.
+     * This is a circular list!
+     */
+    public TermInfo getNext() { return next; }
     public String getText(){ return text; }
     public int getStartOffset(){ return startOffset; }
     public int getEndOffset(){ return endOffset; }

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java?rev=1579264&r1=1579263&r2=1579264&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java Wed Mar 19 15:38:15 2014
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -38,6 +39,7 @@ import org.apache.lucene.index.IndexWrit
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
@@ -504,6 +506,69 @@ public class FastVectorHighlighterTest e
     writer.close();
     dir.close();
   }
+  
+  public void testBooleanPhraseWithSynonym() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+    Document doc = new Document();
+    FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
+    type.setStoreTermVectorOffsets(true);
+    type.setStoreTermVectorPositions(true);
+    type.setStoreTermVectors(true);
+    type.freeze();
+    Token syn = new Token("httpwwwfacebookcom", 6, 29);
+    syn.setPositionIncrement(0);
+    CannedTokenStream ts = new CannedTokenStream(
+        new Token("test", 0, 4),
+        new Token("http", 6, 10),
+        syn,
+        new Token("www", 13, 16),
+        new Token("facebook", 17, 25),
+        new Token("com", 26, 29)
+    );
+    Field field = new Field("field", ts, type);
+    doc.add(field);
+    doc.add(new StoredField("field", "Test: http://www.facebook.com"));
+    writer.addDocument(doc);
+    FastVectorHighlighter highlighter = new FastVectorHighlighter();
+    
+    IndexReader reader = DirectoryReader.open(writer, true);
+    int docId = 0;
+    
+    // query1: match
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("field", "test"));
+    pq.add(new Term("field", "http"));
+    pq.add(new Term("field", "www"));
+    pq.add(new Term("field", "facebook"));
+    pq.add(new Term("field", "com"));
+    FieldQuery fieldQuery  = highlighter.getFieldQuery(pq, reader);
+    String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
+    assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
+    
+    // query2: match
+    PhraseQuery pq2 = new PhraseQuery();
+    pq2.add(new Term("field", "test"));
+    pq2.add(new Term("field", "httpwwwfacebookcom"));
+    pq2.add(new Term("field", "www"));
+    pq2.add(new Term("field", "facebook"));
+    pq2.add(new Term("field", "com"));
+    fieldQuery  = highlighter.getFieldQuery(pq2, reader);
+    bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
+    assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
+    
+    // query3: OR query1 and query2 together
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(pq, BooleanClause.Occur.SHOULD);
+    bq.add(pq2, BooleanClause.Occur.SHOULD);
+    fieldQuery  = highlighter.getFieldQuery(bq, reader);
+    bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
+    assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
+    
+    reader.close();
+    writer.close();
+    dir.close();
+  }
 
   private void matchedFieldsTestCase( String fieldValue, String expected, Query... queryClauses ) throws IOException {
     matchedFieldsTestCase( true, true, fieldValue, expected, queryClauses );

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java?rev=1579264&r1=1579263&r2=1579264&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java Wed Mar 19 15:38:15 2014
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
 import org.apache.lucene.util.AttributeImpl;
 
 public class IndexTimeSynonymTest extends AbstractTestCase {
@@ -47,12 +48,11 @@ public class IndexTimeSynonymTest extend
     bq.add( tq( "MacBook" ), Occur.SHOULD );
     FieldQuery fq = new FieldQuery( bq, true, true );
     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
-    assertEquals( 2, stack.termList.size() );
-    Set<String> expectedSet = new HashSet<>();
-    expectedSet.add( "Mac(11,20,3)" );
-    expectedSet.add( "MacBook(11,20,3)" );
-    assertTrue( expectedSet.contains( stack.pop().toString() ) );
-    assertTrue( expectedSet.contains( stack.pop().toString() ) );
+    assertEquals( 1, stack.termList.size() );
+    TermInfo ti = stack.pop();
+    assertEquals("Mac(11,20,3)", ti.toString());
+    assertEquals("MacBook(11,20,3)", ti.getNext().toString());
+    assertSame(ti, ti.getNext().getNext());
   }
   
   public void testFieldTermStackIndex1w2wSearch1term() throws Exception {
@@ -91,12 +91,11 @@ public class IndexTimeSynonymTest extend
     bq.add( pqF( "personal", "computer" ), Occur.SHOULD );
     FieldQuery fq = new FieldQuery( bq, true, true );
     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
-    assertEquals( 3, stack.termList.size() );
-    Set<String> expectedSet = new HashSet<>();
-    expectedSet.add( "pc(3,5,1)" );
-    expectedSet.add( "personal(3,5,1)" );
-    assertTrue( expectedSet.contains( stack.pop().toString() ) );
-    assertTrue( expectedSet.contains( stack.pop().toString() ) );
+    assertEquals( 2, stack.termList.size() );
+    TermInfo ti = stack.pop();
+    assertEquals( "pc(3,5,1)", ti.toString());
+    assertEquals( "personal(3,5,1)", ti.getNext().toString());
+    assertSame(ti, ti.getNext().getNext());
     assertEquals( "computer(3,5,2)", stack.pop().toString() );
   }
   
@@ -136,12 +135,11 @@ public class IndexTimeSynonymTest extend
     bq.add( pqF( "personal", "computer" ), Occur.SHOULD );
     FieldQuery fq = new FieldQuery( bq, true, true );
     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
-    assertEquals( 3, stack.termList.size() );
-    Set<String> expectedSet = new HashSet<>();
-    expectedSet.add( "pc(3,20,1)" );
-    expectedSet.add( "personal(3,20,1)" );
-    assertTrue( expectedSet.contains( stack.pop().toString() ) );
-    assertTrue( expectedSet.contains( stack.pop().toString() ) );
+    assertEquals( 2, stack.termList.size() );
+    TermInfo ti = stack.pop();
+    assertEquals("pc(3,20,1)", ti.toString());
+    assertEquals("personal(3,20,1)", ti.getNext().toString());
+    assertSame(ti, ti.getNext().getNext());
     assertEquals( "computer(3,20,2)", stack.pop().toString() );
   }