You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/01/03 10:51:57 UTC

[09/18] lucene-solr:jira/solr-11702: LUCENE-7993: Faster phrases if total hit counts are not required.

LUCENE-7993: Faster phrases if total hit counts are not required.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c95dc6d9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c95dc6d9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c95dc6d9

Branch: refs/heads/jira/solr-11702
Commit: c95dc6d95743f4a9a1ffe9baa04c3a9d1e3acdf9
Parents: b2f2481
Author: Adrien Grand <jp...@gmail.com>
Authored: Fri Dec 29 09:14:32 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Fri Dec 29 10:06:00 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 ++
 .../apache/lucene/search/ExactPhraseScorer.java | 23 ++++++++--
 .../apache/lucene/search/MultiPhraseQuery.java  | 14 +++---
 .../org/apache/lucene/search/PhraseQuery.java   | 14 +++---
 .../apache/lucene/search/TestPhraseQuery.java   | 47 ++++++++++++++++++++
 5 files changed, 84 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c95dc6d9/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0da2cfe..ef7e005 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -61,6 +61,9 @@ Optimizations
 * LUCENE-4100: Disjunctions now support faster collection of top hits when the
   total hit count is not required. (Stefan Pohl, Adrien Grand, Robert Muir)
 
+* LUCENE-7993: Phrase queries are now faster if total hit counts are not
+  required. (Adrien Grand)
+
 ======================= Lucene 7.3.0 =======================
 
 API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c95dc6d9/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index 85a242e..f4a7ca7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -43,15 +43,17 @@ final class ExactPhraseScorer extends Scorer {
   private int freq;
 
   private final Similarity.SimScorer docScorer;
-  private final boolean needsScores;
+  private final boolean needsScores, needsTotalHitCount;
   private float matchCost;
+  private float minCompetitiveScore;
 
   ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
-                    Similarity.SimScorer docScorer, boolean needsScores,
+                    Similarity.SimScorer docScorer, ScoreMode scoreMode,
                     float matchCost) throws IOException {
     super(weight);
     this.docScorer = docScorer;
-    this.needsScores = needsScores;
+    this.needsScores = scoreMode.needsScores();
+    this.needsTotalHitCount = scoreMode != ScoreMode.TOP_SCORES;
 
     List<DocIdSetIterator> iterators = new ArrayList<>();
     List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
@@ -66,10 +68,25 @@ final class ExactPhraseScorer extends Scorer {
   }
 
   @Override
+  public void setMinCompetitiveScore(float minScore) {
+    minCompetitiveScore = minScore;
+  }
+
+  @Override
   public TwoPhaseIterator twoPhaseIterator() {
     return new TwoPhaseIterator(conjunction) {
       @Override
       public boolean matches() throws IOException {
+        if (needsTotalHitCount == false && minCompetitiveScore > 0) {
+          int minFreq = postings[0].postings.freq();
+          for (int i = 1; i < postings.length; ++i) {
+            minFreq = Math.min(postings[i].postings.freq(), minFreq);
+          }
+          if (docScorer.score(docID(), minFreq) < minCompetitiveScore) {
+            // The maximum score we could get is less than the min competitive score
+            return false;
+          }
+        }
         return phraseFreq() > 0;
       }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c95dc6d9/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
index 44a5ad0..34361a7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -185,13 +185,13 @@ public class MultiPhraseQuery extends Query {
     private final Similarity similarity;
     private final Similarity.SimWeight stats;
     private final Map<Term,TermContext> termContexts = new HashMap<>();
-    private final boolean needsScores;
+    private final ScoreMode scoreMode;
 
-    public MultiPhraseWeight(IndexSearcher searcher, boolean needsScores, float boost)
+    public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
       super(MultiPhraseQuery.this);
-      this.needsScores = needsScores;
-      this.similarity = searcher.getSimilarity(needsScores);
+      this.scoreMode = scoreMode;
+      this.similarity = searcher.getSimilarity(scoreMode.needsScores());
       final IndexReaderContext context = searcher.getTopReaderContext();
 
       // compute idf
@@ -283,11 +283,11 @@ public class MultiPhraseQuery extends Query {
       if (slop == 0) {
         return new ExactPhraseScorer(this, postingsFreqs,
                                       similarity.simScorer(stats, context),
-                                      needsScores, totalMatchCost);
+                                      scoreMode, totalMatchCost);
       } else {
         return new SloppyPhraseScorer(this, postingsFreqs, slop,
                                         similarity.simScorer(stats, context),
-                                        needsScores, totalMatchCost);
+                                        scoreMode.needsScores(), totalMatchCost);
       }
     }
 
@@ -335,7 +335,7 @@ public class MultiPhraseQuery extends Query {
 
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    return new MultiPhraseWeight(searcher, scoreMode.needsScores(), boost);
+    return new MultiPhraseWeight(searcher, scoreMode, boost);
   }
 
   /** Prints a user-readable version of this query. */

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c95dc6d9/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index e0b60be..3d359b4 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -353,10 +353,10 @@ public class PhraseQuery extends Query {
   private class PhraseWeight extends Weight {
     private final Similarity similarity;
     private final Similarity.SimWeight stats;
-    private final boolean needsScores;
+    private final ScoreMode scoreMode;
     private transient TermContext states[];
 
-    public PhraseWeight(IndexSearcher searcher, boolean needsScores, float boost)
+    public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
       super(PhraseQuery.this);
       final int[] positions = PhraseQuery.this.getPositions();
@@ -365,8 +365,8 @@ public class PhraseQuery extends Query {
       } else if (positions[0] != 0) {
         throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
       }
-      this.needsScores = needsScores;
-      this.similarity = searcher.getSimilarity(needsScores);
+      this.scoreMode = scoreMode;
+      this.similarity = searcher.getSimilarity(scoreMode.needsScores());
       final IndexReaderContext context = searcher.getTopReaderContext();
       states = new TermContext[terms.length];
       TermStatistics termStats[] = new TermStatistics[terms.length];
@@ -434,11 +434,11 @@ public class PhraseQuery extends Query {
       if (slop == 0) {  // optimize exact case
         return new ExactPhraseScorer(this, postingsFreqs,
                                       similarity.simScorer(stats, context),
-                                      needsScores, totalMatchCost);
+                                      scoreMode, totalMatchCost);
       } else {
         return new SloppyPhraseScorer(this, postingsFreqs, slop,
                                         similarity.simScorer(stats, context),
-                                        needsScores, totalMatchCost);
+                                        scoreMode.needsScores(), totalMatchCost);
       }
     }
 
@@ -510,7 +510,7 @@ public class PhraseQuery extends Query {
 
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    return new PhraseWeight(searcher, scoreMode.needsScores(), boost);
+    return new PhraseWeight(searcher, scoreMode, boost);
   }
 
   /** Prints a user-readable version of this query. */

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c95dc6d9/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
index 2bbd0dd..eb31128 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
@@ -19,6 +19,8 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Random;
 
@@ -33,7 +35,10 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
@@ -713,4 +718,46 @@ public class TestPhraseQuery extends LuceneTestCase {
       builder.add(new Term("field", "three"), 4);
     });
   }
+
+  static String[] DOCS = new String[] {
+      "a b c d e f g h",
+      "b c b",
+      "c d d d e f g b",
+      "c b a b c",
+      "a a b b c c d d",
+      "a b c d a b c d a b c d"
+  };
+
+  public void testTopPhrases() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
+    String[] docs = Arrays.copyOf(DOCS, DOCS.length);
+    Collections.shuffle(Arrays.asList(docs), random());
+    for (String value : DOCS) {
+      Document doc = new Document();
+      doc.add(new TextField("f", value, Store.NO));
+      w.addDocument(doc);
+    }
+    IndexReader r = DirectoryReader.open(w);
+    w.close();
+    IndexSearcher searcher = newSearcher(r);
+    for (Query query : Arrays.asList(
+        new PhraseQuery("f", "b", "c"), // common phrase
+        new PhraseQuery("f", "e", "f"), // always appear next to each other
+        new PhraseQuery("f", "d", "d")  // repeated term
+        )) {
+      for (int topN = 1; topN <= 2; ++topN) {
+        TopScoreDocCollector collector1 = TopScoreDocCollector.create(topN, null, true);
+        searcher.search(query, collector1);
+        ScoreDoc[] hits1 = collector1.topDocs().scoreDocs;
+        TopScoreDocCollector collector2 = TopScoreDocCollector.create(topN, null, false);
+        searcher.search(query, collector2);
+        ScoreDoc[] hits2 = collector2.topDocs().scoreDocs;
+        assertTrue("" + query, hits1.length > 0);
+        CheckHits.checkEqual(query, hits1, hits2);
+      }
+    }
+    r.close();
+    dir.close();
+  }
 }