You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2022/05/30 18:02:45 UTC
[lucene] branch main updated: LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)
This is an automated email from the ASF dual-hosted git repository.
julietibs pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new e319a5223ca LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)
e319a5223ca is described below
commit e319a5223cac757f4d9c7a80d3b0587370f8aa5f
Author: Yannick Welsch <ya...@welsch.lu>
AuthorDate: Mon May 30 20:02:40 2022 +0200
LUCENE-10582: Fix merging of CollectionStatistics in CombinedFieldQuery (#910)
CombinedFieldQuery does not properly combine overridden collection statistics, resulting in an IllegalArgumentException during searches.
---
lucene/CHANGES.txt | 2 +
.../lucene/sandbox/search/CombinedFieldQuery.java | 3 +-
.../sandbox/search/TestCombinedFieldQuery.java | 82 ++++++++++++++++++++++
3 files changed, 86 insertions(+), 1 deletion(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 603edacc0b8..4b3f15bb724 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -88,6 +88,8 @@ Bug Fixes
* LUCENE-10574: Prevent pathological O(N^2) merging. (Adrien Grand)
+* LUCENE-10582: Fix merging of overridden CollectionStatistics in CombinedFieldQuery (Yannick Welsch)
+
Other
---------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
index fccd6ce3eca..88a905d8f77 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
@@ -352,13 +352,14 @@ public final class CombinedFieldQuery extends Query implements Accountable {
private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher)
throws IOException {
- long maxDoc = searcher.getIndexReader().maxDoc();
+ long maxDoc = 0;
long docCount = 0;
long sumTotalTermFreq = 0;
long sumDocFreq = 0;
for (FieldAndWeight fieldWeight : fieldAndWeights.values()) {
CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field);
if (collectionStats != null) {
+ maxDoc = Math.max(collectionStats.maxDoc(), maxDoc);
docCount = Math.max(collectionStats.docCount(), docCount);
sumDocFreq = Math.max(collectionStats.sumDocFreq(), sumDocFreq);
sumTotalTermFreq += (double) fieldWeight.weight * collectionStats.sumTotalTermFreq();
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
index 331abc7e7a7..739c9b85323 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
@@ -589,4 +589,86 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
return new BM25Similarity().scorer(boost, collectionStats, termStats);
}
}
+
+ public void testOverrideCollectionStatistics() throws IOException { // LUCENE-10582 regression test: CombinedFieldQuery against a searcher whose collectionStatistics() is overridden
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig();
+ Similarity similarity = randomCompatibleSimilarity(); // helper defined outside this hunk
+ iwc.setSimilarity(similarity); // the same similarity is set on the searcher further down
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+ int numMatch = atLeast(10); // number of documents that will contain "foo"
+ for (int i = 0; i < numMatch; i++) {
+ Document doc = new Document();
+ if (random().nextBoolean()) { // randomly interleave an extra document that only contains "baz"
+ doc.add(new TextField("a", "baz", Store.NO));
+ doc.add(new TextField("b", "baz", Store.NO));
+ for (int k = 0; k < 2; k++) { // "ab" receives "baz" twice, matching one occurrence each in "a" and "b"
+ doc.add(new TextField("ab", "baz", Store.NO));
+ }
+ w.addDocument(doc);
+ doc.clear(); // reuse the same Document instance for the "foo" document below
+ }
+ int freqA = random().nextInt(5) + 1; // 1..5 occurrences of "foo" in field "a"
+ for (int j = 0; j < freqA; j++) {
+ doc.add(new TextField("a", "foo", Store.NO));
+ }
+ int freqB = random().nextInt(5) + 1; // 1..5 occurrences of "foo" in field "b"
+ for (int j = 0; j < freqB; j++) {
+ doc.add(new TextField("b", "foo", Store.NO));
+ }
+ int freqAB = freqA + freqB; // "ab" holds as many "foo" occurrences as "a" and "b" together
+ for (int j = 0; j < freqAB; j++) {
+ doc.add(new TextField("ab", "foo", Store.NO));
+ }
+ w.addDocument(doc);
+ }
+
+ IndexReader reader = w.getReader();
+
+ int extraMaxDoc = randomIntBetween(0, 10); // random amounts added on top of the index's real statistics
+ int extraDocCount = randomIntBetween(0, extraMaxDoc); // never exceeds extraMaxDoc
+ int extraSumDocFreq = extraDocCount + randomIntBetween(0, 10); // never below extraDocCount
+
+ int extraSumTotalTermFreqA = extraSumDocFreq + randomIntBetween(0, 10); // never below extraSumDocFreq
+ int extraSumTotalTermFreqB = extraSumDocFreq + randomIntBetween(0, 10); // never below extraSumDocFreq
+ int extraSumTotalTermFreqAB = extraSumTotalTermFreqA + extraSumTotalTermFreqB; // extra for "ab" is the sum of the extras for "a" and "b"
+
+ IndexSearcher searcher =
+ new IndexSearcher(reader) { // searcher reporting larger statistics than the local index; presumably mimics stats aggregated across shards - confirm
+ @Override
+ public CollectionStatistics collectionStatistics(String field) throws IOException {
+ CollectionStatistics shardStatistics = super.collectionStatistics(field); // real statistics from the underlying reader
+ int extraSumTotalTermFreq;
+ if (field.equals("a")) {
+ extraSumTotalTermFreq = extraSumTotalTermFreqA;
+ } else if (field.equals("b")) {
+ extraSumTotalTermFreq = extraSumTotalTermFreqB;
+ } else if (field.equals("ab")) {
+ extraSumTotalTermFreq = extraSumTotalTermFreqAB;
+ } else {
+ throw new AssertionError("should never be called"); // only "a", "b" and "ab" are expected here
+ }
+ return new CollectionStatistics( // each real statistic shifted by its extra amount
+ field,
+ shardStatistics.maxDoc() + extraMaxDoc,
+ shardStatistics.docCount() + extraDocCount,
+ shardStatistics.sumTotalTermFreq() + extraSumTotalTermFreq,
+ shardStatistics.sumDocFreq() + extraSumDocFreq);
+ }
+ };
+ searcher.setSimilarity(similarity);
+ CombinedFieldQuery query = // "foo" searched across fields "a" and "b" combined
+ new CombinedFieldQuery.Builder()
+ .addField("a")
+ .addField("b")
+ .addTerm(new BytesRef("foo"))
+ .build();
+
+ checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo"))); // helper defined outside this hunk; presumably compares the combined query's hits with the TermQuery on "ab" - confirm
+
+ reader.close();
+ w.close();
+ dir.close();
+ }
}