You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2013/08/20 17:50:05 UTC

svn commit: r1515850 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/

Author: simonw
Date: Tue Aug 20 15:50:05 2013
New Revision: 1515850

URL: http://svn.apache.org/r1515850
Log:
LUCENE-5182: Terminate phrase searches early if max phrase window is exceeded

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
    lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1515850&r1=1515849&r2=1515850&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Aug 20 15:50:05 2013
@@ -170,6 +170,10 @@ Optimizations
   transition from DWPT into IndexWriter is now done via an Event-Queue
   processed from within the IndexWriter in order to prevent suituations
   where DWPT or DW calling int IW causing deadlocks. (Simon Willnauer)
+
+* LUCENE-5182: Terminate phrase searches early if max phrase window is
+  exceeded in FastVectorHighlighter to prevent very long-running phrase
+  extraction when phrase terms occur very frequently. (Simon Willnauer)
   
 Documentation
 

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1515850&r1=1515849&r2=1515850&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Tue Aug 20 15:50:05 2013
@@ -69,6 +69,9 @@ public class FieldPhraseList {
   }
 
   void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
+    if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
+      return;
+    }
     if (terms.isEmpty()) {
       if (longest > 0) {
         addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java?rev=1515850&r1=1515849&r2=1515850&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java Tue Aug 20 15:50:05 2013
@@ -30,7 +30,6 @@ import java.util.Set;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.queries.CommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
@@ -63,6 +62,8 @@ public class FieldQuery {
 
   // The maximum number of different matching terms accumulated from any one MultiTermQuery
   private static final int MAX_MTQ_TERMS = 1024;
+  
+  private int maxPhraseWindow = 1;
 
   FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
     this.fieldMatch = fieldMatch;
@@ -400,7 +401,7 @@ public class FieldQuery {
             return positions[i] - positions[j];
           }
         }.sort(0, terms.length);
-
+        
         addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
       }
       else
@@ -474,8 +475,18 @@ public class FieldQuery {
         this.boost = boost;
         this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
         this.positions = positions;
+        if (positions != null) {
+          fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
+        }
       }
     }
+   
+    /**
+     * Returns the owning FieldQuery's max phrase window: the largest slop + (last position - first position) recorded so far for any entry that has positions, never less than 1.
+     */ 
+    int getMaxPhraseWindow() {
+      return fieldQuery.maxPhraseWindow;
+    }
     
     public boolean isTerminal(){
       return terminal;

Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java?rev=1515850&r1=1515849&r2=1515850&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java Tue Aug 20 15:50:05 2013
@@ -47,6 +47,7 @@ import org.apache.lucene.search.TermQuer
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 
 public class FastVectorHighlighterTest extends LuceneTestCase {
@@ -298,6 +299,49 @@ public class FastVectorHighlighterTest e
     writer.close();
     dir.close();
   }
+  
+  public void testLotsOfPhrases() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,  new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
+    FieldType type = new FieldType(TextField.TYPE_STORED);
+    type.setStoreTermVectorOffsets(true);
+    type.setStoreTermVectorPositions(true);
+    type.setStoreTermVectors(true);
+    type.freeze();
+    String[] terms = { "org", "apache", "lucene"};
+    int iters = atLeast(1000);
+    StringBuilder builder = new StringBuilder();
+    for (int i = 0; i < iters; i++) { // one random phrase term per iteration, so each term repeats many times
+      builder.append(terms[random().nextInt(terms.length)]).append(" ");
+      if (random().nextInt(6) == 3) { // occasionally interleave a term that is not part of the phrase
+        builder.append("solr").append(" ");
+      }
+    }
+      Document doc = new Document();
+      Field field = new Field("field", builder.toString(), type);
+      doc.add(field);
+      writer.addDocument(doc);
+    PhraseQuery query = new PhraseQuery();
+    query.add(new Term("field", "org"));
+    query.add(new Term("field", "apache"));
+    query.add(new Term("field", "lucene"));
+    
+    // Highlight the phrase "org apache lucene" in the single document built above from random
+    // repetitions of its terms, then verify that no occurrence of the phrase is left unhighlighted.
+    FastVectorHighlighter highlighter = new FastVectorHighlighter();
+    IndexReader reader = DirectoryReader.open(writer, true);
+    IndexSearcher searcher = newSearcher(reader);
+    TopDocs hits = searcher.search(query, 10);
+    assertEquals(1, hits.totalHits); // only one document was indexed; it is expected to match the phrase
+    FieldQuery fieldQuery  = highlighter.getFieldQuery(query, reader);
+    String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
+    for (int i = 0; i < bestFragments.length; i++) {
+      String result = bestFragments[i].replaceAll("<b>org apache lucene</b>", "FOOBAR"); // mask every highlighted occurrence
+      assertFalse(result.contains("org apache lucene")); // anything still present was not highlighted
+    }
+    reader.close();
+    writer.close();
+    dir.close();
+  }
 
   public void testOverlappingPhrases() throws IOException {
     final Analyzer analyzer = new Analyzer() {