You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/19 16:19:26 UTC
svn commit: r1579255 - in /lucene/dev/trunk/lucene: ./
highlighter/src/java/org/apache/lucene/search/vectorhighlight/
highlighter/src/test/org/apache/lucene/search/vectorhighlight/
Author: rmuir
Date: Wed Mar 19 15:19:26 2014
New Revision: 1579255
URL: http://svn.apache.org/r1579255
Log:
LUCENE-5538: FastVectorHighlighter fails with booleans of phrases
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1579255&r1=1579254&r2=1579255&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Mar 19 15:19:26 2014
@@ -217,6 +217,9 @@ Bug fixes
* LUCENE-5111: Fix WordDelimiterFilter to return offsets in correct order. (Robert Muir)
+* LUCENE-5538: Fix FastVectorHighlighter bug with index-time synonyms when the
+ query is more complex than a single phrase. (Robert Muir)
+
Test Framework
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1579255&r1=1579254&r2=1579255&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Wed Mar 19 15:19:26 2014
@@ -72,8 +72,15 @@ public class FieldPhraseList {
{
phraseCandidate.clear();
- TermInfo ti = fieldTermStack.pop();
+ TermInfo ti = null;
+ TermInfo first = null;
+
+ first = ti = fieldTermStack.pop();
currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+ while (currMap == null && ti.getNext() != first) {
+ ti = ti.getNext();
+ currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+ }
// if not found, discard top TermInfo from stack, then try next element
if( currMap == null ) continue;
@@ -81,10 +88,15 @@ public class FieldPhraseList {
// if found, search the longest phrase
phraseCandidate.add( ti );
while( true ){
- ti = fieldTermStack.pop();
+ first = ti = fieldTermStack.pop();
nextMap = null;
- if( ti != null )
+ if( ti != null ) {
nextMap = currMap.getTermMap( ti.getText() );
+ while (nextMap == null && ti.getNext() != first) {
+ ti = ti.getNext();
+ nextMap = currMap.getTermMap( ti.getText() );
+ }
+ }
if( ti == null || nextMap == null ){
if( ti != null )
fieldTermStack.push( ti );
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1579255&r1=1579254&r2=1579255&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Wed Mar 19 15:19:26 2014
@@ -18,6 +18,7 @@ package org.apache.lucene.search.vectorh
import java.io.IOException;
import java.util.Collections;
+import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
@@ -128,6 +129,30 @@ public class FieldTermStack {
// sort by position
Collections.sort(termList);
+
+ // now look for dups at the same position, linking them together
+ int currentPos = -1;
+ TermInfo previous = null;
+ TermInfo first = null;
+ Iterator<TermInfo> iterator = termList.iterator();
+ while (iterator.hasNext()) {
+ TermInfo current = iterator.next();
+ if (current.position == currentPos) {
+ assert previous != null;
+ previous.setNext(current);
+ previous = current;
+ iterator.remove();
+ } else {
+ if (previous != null) {
+ previous.setNext(first);
+ }
+ previous = first = current;
+ currentPos = current.position;
+ }
+ }
+ if (previous != null) {
+ previous.setNext(first);
+ }
}
/**
@@ -173,6 +198,10 @@ public class FieldTermStack {
// IDF-weight of this term
private final float weight;
+
+ // pointer to other TermInfo's at the same position.
+ // this is a circular list, so with no syns, just points to itself
+ private TermInfo next;
public TermInfo( String text, int startOffset, int endOffset, int position, float weight ){
this.text = text;
@@ -180,8 +209,15 @@ public class FieldTermStack {
this.endOffset = endOffset;
this.position = position;
this.weight = weight;
+ this.next = this;
}
+ void setNext(TermInfo next) { this.next = next; }
+ /**
+ * Returns the next TermInfo at this same position.
+ * This is a circular list!
+ */
+ public TermInfo getNext() { return next; }
public String getText(){ return text; }
public int getStartOffset(){ return startOffset; }
public int getEndOffset(){ return endOffset; }
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java?rev=1579255&r1=1579254&r2=1579255&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java Wed Mar 19 15:19:26 2014
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@@ -38,6 +39,7 @@ import org.apache.lucene.index.IndexWrit
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
@@ -504,6 +506,69 @@ public class FastVectorHighlighterTest e
writer.close();
dir.close();
}
+
+ public void testBooleanPhraseWithSynonym() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+ Document doc = new Document();
+ FieldType type = new FieldType(TextField.TYPE_NOT_STORED);
+ type.setStoreTermVectorOffsets(true);
+ type.setStoreTermVectorPositions(true);
+ type.setStoreTermVectors(true);
+ type.freeze();
+ Token syn = new Token("httpwwwfacebookcom", 6, 29);
+ syn.setPositionIncrement(0);
+ CannedTokenStream ts = new CannedTokenStream(
+ new Token("test", 0, 4),
+ new Token("http", 6, 10),
+ syn,
+ new Token("www", 13, 16),
+ new Token("facebook", 17, 25),
+ new Token("com", 26, 29)
+ );
+ Field field = new Field("field", ts, type);
+ doc.add(field);
+ doc.add(new StoredField("field", "Test: http://www.facebook.com"));
+ writer.addDocument(doc);
+ FastVectorHighlighter highlighter = new FastVectorHighlighter();
+
+ IndexReader reader = DirectoryReader.open(writer, true);
+ int docId = 0;
+
+ // query1: match
+ PhraseQuery pq = new PhraseQuery();
+ pq.add(new Term("field", "test"));
+ pq.add(new Term("field", "http"));
+ pq.add(new Term("field", "www"));
+ pq.add(new Term("field", "facebook"));
+ pq.add(new Term("field", "com"));
+ FieldQuery fieldQuery = highlighter.getFieldQuery(pq, reader);
+ String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
+ assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
+
+ // query2: match
+ PhraseQuery pq2 = new PhraseQuery();
+ pq2.add(new Term("field", "test"));
+ pq2.add(new Term("field", "httpwwwfacebookcom"));
+ pq2.add(new Term("field", "www"));
+ pq2.add(new Term("field", "facebook"));
+ pq2.add(new Term("field", "com"));
+ fieldQuery = highlighter.getFieldQuery(pq2, reader);
+ bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
+ assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
+
+ // query3: OR query1 and query2 together
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(pq, BooleanClause.Occur.SHOULD);
+ bq.add(pq2, BooleanClause.Occur.SHOULD);
+ fieldQuery = highlighter.getFieldQuery(bq, reader);
+ bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 54, 1);
+ assertEquals("<b>Test: http://www.facebook.com</b>", bestFragments[0]);
+
+ reader.close();
+ writer.close();
+ dir.close();
+ }
private void matchedFieldsTestCase( String fieldValue, String expected, Query... queryClauses ) throws IOException {
matchedFieldsTestCase( true, true, fieldValue, expected, queryClauses );
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java?rev=1579255&r1=1579254&r2=1579255&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.java Wed Mar 19 15:19:26 2014
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.AttributeImpl;
public class IndexTimeSynonymTest extends AbstractTestCase {
@@ -47,12 +48,11 @@ public class IndexTimeSynonymTest extend
bq.add( tq( "MacBook" ), Occur.SHOULD );
FieldQuery fq = new FieldQuery( bq, true, true );
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
- assertEquals( 2, stack.termList.size() );
- Set<String> expectedSet = new HashSet<>();
- expectedSet.add( "Mac(11,20,3)" );
- expectedSet.add( "MacBook(11,20,3)" );
- assertTrue( expectedSet.contains( stack.pop().toString() ) );
- assertTrue( expectedSet.contains( stack.pop().toString() ) );
+ assertEquals( 1, stack.termList.size() );
+ TermInfo ti = stack.pop();
+ assertEquals("Mac(11,20,3)", ti.toString());
+ assertEquals("MacBook(11,20,3)", ti.getNext().toString());
+ assertSame(ti, ti.getNext().getNext());
}
public void testFieldTermStackIndex1w2wSearch1term() throws Exception {
@@ -91,12 +91,11 @@ public class IndexTimeSynonymTest extend
bq.add( pqF( "personal", "computer" ), Occur.SHOULD );
FieldQuery fq = new FieldQuery( bq, true, true );
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
- assertEquals( 3, stack.termList.size() );
- Set<String> expectedSet = new HashSet<>();
- expectedSet.add( "pc(3,5,1)" );
- expectedSet.add( "personal(3,5,1)" );
- assertTrue( expectedSet.contains( stack.pop().toString() ) );
- assertTrue( expectedSet.contains( stack.pop().toString() ) );
+ assertEquals( 2, stack.termList.size() );
+ TermInfo ti = stack.pop();
+ assertEquals( "pc(3,5,1)", ti.toString());
+ assertEquals( "personal(3,5,1)", ti.getNext().toString());
+ assertSame(ti, ti.getNext().getNext());
assertEquals( "computer(3,5,2)", stack.pop().toString() );
}
@@ -136,12 +135,11 @@ public class IndexTimeSynonymTest extend
bq.add( pqF( "personal", "computer" ), Occur.SHOULD );
FieldQuery fq = new FieldQuery( bq, true, true );
FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
- assertEquals( 3, stack.termList.size() );
- Set<String> expectedSet = new HashSet<>();
- expectedSet.add( "pc(3,20,1)" );
- expectedSet.add( "personal(3,20,1)" );
- assertTrue( expectedSet.contains( stack.pop().toString() ) );
- assertTrue( expectedSet.contains( stack.pop().toString() ) );
+ assertEquals( 2, stack.termList.size() );
+ TermInfo ti = stack.pop();
+ assertEquals("pc(3,20,1)", ti.toString());
+ assertEquals("personal(3,20,1)", ti.getNext().toString());
+ assertSame(ti, ti.getNext().getNext());
assertEquals( "computer(3,20,2)", stack.pop().toString() );
}