You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ju...@apache.org on 2021/07/17 01:40:40 UTC
[lucene] branch main updated: LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)

This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new f333b70  LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
f333b70 is described below

commit f333b70dbff88c3670c73b7773f40b11e6712af3
Author: Jim Ferenczi <ji...@elastic.co>
AuthorDate: Sat Jul 17 03:40:28 2021 +0200

    LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
    
    This change fixes a bug in `MultiNormsLeafSimScorer` that assumes that each
    field should have a norm for every term/document.
    
    As part of the fix, it adds validation that the fields have consistent norms
    settings.
---
 lucene/CHANGES.txt                                 |   3 +
 .../lucene/sandbox/search/CombinedFieldQuery.java  |  29 ++++++
 .../sandbox/search/MultiNormsLeafSimScorer.java    |  21 +++--
 .../sandbox/search/TestCombinedFieldQuery.java     | 103 +++++++++++++++++++++
 4 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ce9efa7..14ca78d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -398,6 +398,9 @@ Bug Fixes
 * LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
   (Gautam Worah)
 
+* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
+  is missing some fields. (Jim Ferenczi, Julie Tibshirani)
+
 * LUCENE-10020: DocComparator should not skip docs with the same docID on
   multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
 
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
index 8bafe11..ffb9d13 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CombinedFieldQuery.java
@@ -27,6 +27,8 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.TreeMap;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
@@ -81,6 +83,9 @@ import org.apache.lucene.util.SmallFloat;
  * compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
  * BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
  *
+ * <p>The query also requires that either all fields or no fields have norms enabled. Having only
+ * some fields with norms enabled can result in errors.
+ *
  * <p>The scoring is based on BM25F's simple formula described in:
  * http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
  * same approach but allows other similarities besides {@link
@@ -278,6 +283,7 @@ public final class CombinedFieldQuery extends Query implements Accountable {
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
+    validateConsistentNorms(searcher.getIndexReader());
     if (scoreMode.needsScores()) {
       return new CombinedFieldWeight(this, searcher, scoreMode, boost);
     } else {
@@ -287,6 +293,29 @@ public final class CombinedFieldQuery extends Query implements Accountable {
     }
   }
 
+  private void validateConsistentNorms(IndexReader reader) {
+    boolean allFieldsHaveNorms = true;
+    boolean noFieldsHaveNorms = true;
+
+    for (LeafReaderContext context : reader.leaves()) {
+      FieldInfos fieldInfos = context.reader().getFieldInfos();
+      for (String field : fieldAndWeights.keySet()) {
+        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+        if (fieldInfo != null) {
+          allFieldsHaveNorms &= fieldInfo.hasNorms();
+          noFieldsHaveNorms &= fieldInfo.omitsNorms();
+        }
+      }
+    }
+
+    if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) {
+      throw new IllegalArgumentException(
+          getClass().getSimpleName()
+              + " requires norms to be consistent across fields: some fields cannot "
+              + " have norms enabled, while others have norms disabled");
+    }
+  }
+
   class CombinedFieldWeight extends Weight {
     private final IndexSearcher searcher;
     private final TermStates termStates[];
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java
index e9fde6f..f2c6120 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/MultiNormsLeafSimScorer.java
@@ -30,7 +30,13 @@ import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.util.SmallFloat;
 
-/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
+/**
+ * Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
+ *
+ * <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
+ * requires that either all fields or no fields have norms enabled. Having only some fields with
+ * norms enabled can result in errors or undefined behavior.
+ */
 final class MultiNormsLeafSimScorer {
   /** Cache of decoded norms. */
   private static final float[] LENGTH_TABLE = new float[256];
@@ -62,6 +68,7 @@ final class MultiNormsLeafSimScorer {
           weightList.add(field.weight);
         }
       }
+
       if (normsList.isEmpty()) {
         norms = null;
       } else if (normsList.size() == 1) {
@@ -128,14 +135,16 @@ final class MultiNormsLeafSimScorer {
     @Override
     public boolean advanceExact(int target) throws IOException {
       float normValue = 0;
+      boolean found = false;
       for (int i = 0; i < normsArr.length; i++) {
-        boolean found = normsArr[i].advanceExact(target);
-        assert found;
-        normValue +=
-            weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+        if (normsArr[i].advanceExact(target)) {
+          normValue +=
+              weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+          found = true;
+        }
       }
       current = SmallFloat.intToByte4(Math.round(normValue));
-      return true;
+      return found;
     }
 
     @Override
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
index 4c2c0ae..77aa9ed 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestCombinedFieldQuery.java
@@ -46,6 +46,7 @@ import org.apache.lucene.search.similarities.LMDirichletSimilarity;
 import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -164,6 +165,59 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
     dir.close();
   }
 
+  public void testNormsDisabled() throws IOException {
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    Document doc = new Document();
+    doc.add(new StringField("a", "value", Store.NO));
+    doc.add(new StringField("b", "value", Store.NO));
+    doc.add(new TextField("c", "value", Store.NO));
+    w.addDocument(doc);
+    w.commit();
+
+    doc = new Document();
+    doc.add(new StringField("a", "value", Store.NO));
+    doc.add(new TextField("c", "value", Store.NO));
+    w.addDocument(doc);
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    Similarity searchSimilarity = randomCompatibleSimilarity();
+    searcher.setSimilarity(searchSimilarity);
+    TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", 1.0f)
+            .addField("b", 1.0f)
+            .addTerm(new BytesRef("value"))
+            .build();
+    searcher.search(query, collector);
+    TopDocs topDocs = collector.topDocs();
+    assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);
+
+    CombinedFieldQuery invalidQuery =
+        new CombinedFieldQuery.Builder()
+            .addField("b", 1.0f)
+            .addField("c", 1.0f)
+            .addTerm(new BytesRef("value"))
+            .build();
+    IllegalArgumentException e =
+        expectThrows(
+            IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector));
+    assertTrue(e.getMessage().contains("requires norms to be consistent across fields"));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   public void testCopyField() throws IOException {
     Directory dir = newDirectory();
     Similarity similarity = randomCompatibleSimilarity();
@@ -265,6 +319,55 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
     dir.close();
   }
 
+  public void testCopyFieldWithMissingFields() throws IOException {
+    Directory dir = new MMapDirectory(createTempDir());
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+    int numMatch = atLeast(10);
+    for (int i = 0; i < numMatch; i++) {
+      Document doc = new Document();
+      int freqA = random().nextInt(5) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+
+      // Choose frequencies such that sometimes we don't add field B
+      int freqB = random().nextInt(3);
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "foo", Store.NO));
+      }
+
+      int freqAB = freqA * boost1 + freqB * boost2;
+      for (int j = 0; j < freqAB; j++) {
+        doc.add(new TextField("ab", "foo", Store.NO));
+      }
+
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(similarity);
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .addTerm(new BytesRef("foo"))
+            .build();
+
+    checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   private static Similarity randomCompatibleSimilarity() {
     return RandomPicks.randomFrom(
         random(),