You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2022/05/30 18:02:45 UTC
[lucene] branch main updated: LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)

This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new e319a5223ca LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)
e319a5223ca is described below

commit e319a5223cac757f4d9c7a80d3b0587370f8aa5f
Author: Yannick Welsch <ya...@welsch.lu>
AuthorDate: Mon May 30 20:02:40 2022 +0200

    LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)
    
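    CombinedFieldQuery does not properly combine overridden collection statistics, resulting in an IllegalArgumentException during searches. The merged maxDoc was read directly from the index reader while docCount was taken from the per-field statistics, so when an IndexSearcher overrides collectionStatistics the merged docCount could exceed maxDoc; maxDoc is now merged from the per-field statistics as well.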
---
 lucene/CHANGES.txt                                 |  2 +
 .../lucene/sandbox/search/CombinedFieldQuery.java  |  3 +-
 .../sandbox/search/TestCombinedFieldQuery.java     | 82 ++++++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 603edacc0b8..4b3f15bb724 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -88,6 +88,8 @@ Bug Fixes
 
 * LUCENE-10574: Prevent pathological O(N^2) merging. (Adrien Grand)
 
+* LUCENE-10582: Fix merging of overridden CollectionStatistics in CombinedFieldQuery (Yannick Welsch)
+
 Other
 ---------------------
 
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
index fccd6ce3eca..88a905d8f77 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
@@ -352,13 +352,14 @@ public final class CombinedFieldQuery extends Query implements Accountable {
 
     private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher)
         throws IOException {
-      long maxDoc = searcher.getIndexReader().maxDoc();
+      long maxDoc = 0;
       long docCount = 0;
       long sumTotalTermFreq = 0;
       long sumDocFreq = 0;
       for (FieldAndWeight fieldWeight : fieldAndWeights.values()) {
         CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field);
         if (collectionStats != null) {
+          maxDoc = Math.max(collectionStats.maxDoc(), maxDoc);
           docCount = Math.max(collectionStats.docCount(), docCount);
           sumDocFreq = Math.max(collectionStats.sumDocFreq(), sumDocFreq);
           sumTotalTermFreq += (double) fieldWeight.weight * collectionStats.sumTotalTermFreq();
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
index 331abc7e7a7..739c9b85323 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
@@ -589,4 +589,86 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
       return new BM25Similarity().scorer(boost, collectionStats, termStats);
     }
   }
+
+  public void testOverrideCollectionStatistics() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    Similarity similarity = randomCompatibleSimilarity();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    int numMatch = atLeast(10);
+    for (int i = 0; i < numMatch; i++) {
+      Document doc = new Document();
+      if (random().nextBoolean()) {
+        doc.add(new TextField("a", "baz", Store.NO));
+        doc.add(new TextField("b", "baz", Store.NO));
+        for (int k = 0; k < 2; k++) {
+          doc.add(new TextField("ab", "baz", Store.NO));
+        }
+        w.addDocument(doc);
+        doc.clear();
+      }
+      int freqA = random().nextInt(5) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+      int freqB = random().nextInt(5) + 1;
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "foo", Store.NO));
+      }
+      int freqAB = freqA + freqB;
+      for (int j = 0; j < freqAB; j++) {
+        doc.add(new TextField("ab", "foo", Store.NO));
+      }
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+
+    int extraMaxDoc = randomIntBetween(0, 10);
+    int extraDocCount = randomIntBetween(0, extraMaxDoc);
+    int extraSumDocFreq = extraDocCount + randomIntBetween(0, 10);
+
+    int extraSumTotalTermFreqA = extraSumDocFreq + randomIntBetween(0, 10);
+    int extraSumTotalTermFreqB = extraSumDocFreq + randomIntBetween(0, 10);
+    int extraSumTotalTermFreqAB = extraSumTotalTermFreqA + extraSumTotalTermFreqB;
+
+    IndexSearcher searcher =
+        new IndexSearcher(reader) {
+          @Override
+          public CollectionStatistics collectionStatistics(String field) throws IOException {
+            CollectionStatistics shardStatistics = super.collectionStatistics(field);
+            int extraSumTotalTermFreq;
+            if (field.equals("a")) {
+              extraSumTotalTermFreq = extraSumTotalTermFreqA;
+            } else if (field.equals("b")) {
+              extraSumTotalTermFreq = extraSumTotalTermFreqB;
+            } else if (field.equals("ab")) {
+              extraSumTotalTermFreq = extraSumTotalTermFreqAB;
+            } else {
+              throw new AssertionError("should never be called");
+            }
+            return new CollectionStatistics(
+                field,
+                shardStatistics.maxDoc() + extraMaxDoc,
+                shardStatistics.docCount() + extraDocCount,
+                shardStatistics.sumTotalTermFreq() + extraSumTotalTermFreq,
+                shardStatistics.sumDocFreq() + extraSumDocFreq);
+          }
+        };
+    searcher.setSimilarity(similarity);
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a")
+            .addField("b")
+            .addTerm(new BytesRef("foo"))
+            .build();
+
+    checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
 }