You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2021/07/17 04:22:17 UTC
[lucene-solr] 01/02: LUCENE-9999: CombinedFieldQuery can fail with
an exception when document is missing fields (#185)
This is an automated email from the ASF dual-hosted git repository.
julietibs pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 215e4a5882c540ff9667455c30a707c629e83675
Author: Julie Tibshirani <ju...@gmail.com>
AuthorDate: Fri Jul 16 18:45:46 2021 -0700
LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
This change fixes a bug in `MultiNormsLeafSimScorer` that assumes that each
field should have a norm for every term/document.
As part of the fix, it adds validation that the fields have consistent norms
settings.
---
lucene/CHANGES.txt | 3 +
.../apache/lucene/search/CombinedFieldQuery.java | 29 ++++++
.../lucene/search/MultiNormsLeafSimScorer.java | 21 +++--
.../lucene/search/TestCombinedFieldQuery.java | 102 +++++++++++++++++++++
4 files changed, 149 insertions(+), 6 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ae8a845..c8ede43 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -64,6 +64,9 @@ Bug Fixes
* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
+* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
+ is missing some fields. (Jim Ferenczi, Julie Tibshirani)
+
* LUCENE-10020: DocComparator should not skip docs with the same docID on
multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
index 23d5047..6a4f16c 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
@@ -27,6 +27,8 @@ import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
@@ -62,6 +64,9 @@ import org.apache.lucene.util.SmallFloat;
* compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
* BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
*
+ * <p>The query also requires that either all fields or no fields have norms enabled. Having only
+ * some fields with norms enabled can result in errors.
+ *
* <p>The scoring is based on BM25F's simple formula described in:
* http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
* same approach but allows other similarities besides {@link
@@ -271,6 +276,7 @@ public final class CombinedFieldQuery extends Query implements Accountable {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
+ validateConsistentNorms(searcher.getIndexReader());
if (scoreMode.needsScores()) {
return new CombinedFieldWeight(this, searcher, scoreMode, boost);
} else {
@@ -280,6 +286,29 @@ public final class CombinedFieldQuery extends Query implements Accountable {
}
}
+ private void validateConsistentNorms(IndexReader reader) {
+ boolean allFieldsHaveNorms = true;
+ boolean noFieldsHaveNorms = true;
+
+ for (LeafReaderContext context : reader.leaves()) {
+ FieldInfos fieldInfos = context.reader().getFieldInfos();
+ for (String field : fieldAndWeights.keySet()) {
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ if (fieldInfo != null) {
+ allFieldsHaveNorms &= fieldInfo.hasNorms();
+ noFieldsHaveNorms &= fieldInfo.omitsNorms();
+ }
+ }
+ }
+
+ if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) {
+ throw new IllegalArgumentException(
+ getClass().getSimpleName()
+ + " requires norms to be consistent across fields: some fields cannot "
+ + " have norms enabled, while others have norms disabled");
+ }
+ }
+
class CombinedFieldWeight extends Weight {
private final IndexSearcher searcher;
private final TermStates termStates[];
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
index 8ebefcd..802a53a 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
@@ -28,7 +28,13 @@ import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.SmallFloat;
-/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
+/**
+ * Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
+ *
+ * <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
+ * requires that either all fields or no fields have norms enabled. Having only some fields with
+ * norms enabled can result in errors or undefined behavior.
+ */
final class MultiNormsLeafSimScorer {
/** Cache of decoded norms. */
private static final float[] LENGTH_TABLE = new float[256];
@@ -60,6 +66,7 @@ final class MultiNormsLeafSimScorer {
weightList.add(field.weight);
}
}
+
if (normsList.isEmpty()) {
norms = null;
} else if (normsList.size() == 1) {
@@ -126,14 +133,16 @@ final class MultiNormsLeafSimScorer {
@Override
public boolean advanceExact(int target) throws IOException {
float normValue = 0;
+ boolean found = false;
for (int i = 0; i < normsArr.length; i++) {
- boolean found = normsArr[i].advanceExact(target);
- assert found;
- normValue +=
- weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+ if (normsArr[i].advanceExact(target)) {
+ normValue +=
+ weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+ found = true;
+ }
}
current = SmallFloat.intToByte4(Math.round(normValue));
- return true;
+ return found;
}
@Override
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java
index 0a1880f..8e2fba1 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java
@@ -165,6 +165,59 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
dir.close();
}
+ public void testNormsDisabled() throws IOException {
+ Directory dir = newDirectory();
+ Similarity similarity = randomCompatibleSimilarity();
+
+ IndexWriterConfig iwc = new IndexWriterConfig();
+ iwc.setSimilarity(similarity);
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+ Document doc = new Document();
+ doc.add(new StringField("a", "value", Store.NO));
+ doc.add(new StringField("b", "value", Store.NO));
+ doc.add(new TextField("c", "value", Store.NO));
+ w.addDocument(doc);
+ w.commit();
+
+ doc = new Document();
+ doc.add(new StringField("a", "value", Store.NO));
+ doc.add(new TextField("c", "value", Store.NO));
+ w.addDocument(doc);
+
+ IndexReader reader = w.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+
+ Similarity searchSimilarity = randomCompatibleSimilarity();
+ searcher.setSimilarity(searchSimilarity);
+ TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);
+
+ CombinedFieldQuery query =
+ new CombinedFieldQuery.Builder()
+ .addField("a", 1.0f)
+ .addField("b", 1.0f)
+ .addTerm(new BytesRef("value"))
+ .build();
+ searcher.search(query, collector);
+ TopDocs topDocs = collector.topDocs();
+ assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);
+
+ CombinedFieldQuery invalidQuery =
+ new CombinedFieldQuery.Builder()
+ .addField("b", 1.0f)
+ .addField("c", 1.0f)
+ .addTerm(new BytesRef("value"))
+ .build();
+ IllegalArgumentException e =
+ expectThrows(
+ IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector));
+ assertTrue(e.getMessage().contains("requires norms to be consistent across fields"));
+
+ reader.close();
+ w.close();
+ dir.close();
+ }
+
public void testCopyField() throws IOException {
Directory dir = newDirectory();
Similarity similarity = randomCompatibleSimilarity();
@@ -266,6 +319,55 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
dir.close();
}
+ public void testCopyFieldWithMissingFields() throws IOException {
+ Directory dir = newDirectory();
+ Similarity similarity = randomCompatibleSimilarity();
+
+ IndexWriterConfig iwc = new IndexWriterConfig();
+ iwc.setSimilarity(similarity);
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+ int boost1 = Math.max(1, random().nextInt(5));
+ int boost2 = Math.max(1, random().nextInt(5));
+ int numMatch = atLeast(10);
+ for (int i = 0; i < numMatch; i++) {
+ Document doc = new Document();
+ int freqA = random().nextInt(5) + 1;
+ for (int j = 0; j < freqA; j++) {
+ doc.add(new TextField("a", "foo", Store.NO));
+ }
+
+ // Choose frequencies such that sometimes we don't add field B
+ int freqB = random().nextInt(3);
+ for (int j = 0; j < freqB; j++) {
+ doc.add(new TextField("b", "foo", Store.NO));
+ }
+
+ int freqAB = freqA * boost1 + freqB * boost2;
+ for (int j = 0; j < freqAB; j++) {
+ doc.add(new TextField("ab", "foo", Store.NO));
+ }
+
+ w.addDocument(doc);
+ }
+
+ IndexReader reader = w.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.setSimilarity(similarity);
+ CombinedFieldQuery query =
+ new CombinedFieldQuery.Builder()
+ .addField("a", (float) boost1)
+ .addField("b", (float) boost2)
+ .addTerm(new BytesRef("foo"))
+ .build();
+
+ checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
+
+ reader.close();
+ w.close();
+ dir.close();
+ }
+
private static Similarity randomCompatibleSimilarity() {
return RandomPicks.randomFrom(
random(),