Posted to commits@lucene.apache.org by ju...@apache.org on 2021/07/17 04:22:16 UTC

[lucene-solr] branch branch_8x updated (2ad2867 -> ac86325)

This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a change to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


    from 2ad2867  LUCENE-10026: Fix CombinedFieldQuery equals and hashCode (#212). The previous equals and hashCode methods only compared query terms. This meant that queries on different fields, or with different field weights, were considered the same.
     new 215e4a5  LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
     new ac86325  Add missing changelog entry for LUCENE-10026

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 lucene/CHANGES.txt                                 |   6 ++
 .../apache/lucene/search/CombinedFieldQuery.java   |  29 ++++++
 .../lucene/search/MultiNormsLeafSimScorer.java     |  21 +++--
 .../lucene/search/TestCombinedFieldQuery.java      | 102 +++++++++++++++++++++
 4 files changed, 152 insertions(+), 6 deletions(-)

[lucene-solr] 02/02: Add missing changelog entry for LUCENE-10026

Posted by ju...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit ac86325d3a22ba4ee5d6fdbeea8883dfc0ebb5c2
Author: Julie Tibshirani <ju...@gmail.com>
AuthorDate: Fri Jul 16 21:19:11 2021 -0700

    Add missing changelog entry for LUCENE-10026
---
 lucene/CHANGES.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c8ede43..1282ad2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -70,6 +70,9 @@ Bug Fixes
 * LUCENE-10020: DocComparator should not skip docs with the same docID on
   multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
 
+* LUCENE-10026: Fix CombinedFieldQuery equals and hashCode, which ensures
+  query rewrites don't drop CombinedFieldQuery clauses. (Julie Tibshirani)
+
 Other
 ---------------------
 

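The changelog entry added above is terse, so it is worth spelling out the symptom noted in the branch summary: the old equals and hashCode only compared query terms, so two CombinedFieldQuery instances over different fields or with different weights compared equal, and a query rewrite that deduplicates clauses via equals could silently drop one of them (the dropped clauses the entry refers to). Below is a minimal sketch of the post-fix behavior; it is not part of the patch. The Builder calls mirror the test code in the next commit, while the class name and field names here are made up.

import org.apache.lucene.search.CombinedFieldQuery;
import org.apache.lucene.util.BytesRef;

public class CombinedFieldEqualsSketch {
  public static void main(String[] args) {
    // Same term, different fields and weights.
    CombinedFieldQuery q1 =
        new CombinedFieldQuery.Builder()
            .addField("title", 2.0f)
            .addTerm(new BytesRef("lucene"))
            .build();
    CombinedFieldQuery q2 =
        new CombinedFieldQuery.Builder()
            .addField("body", 1.0f)
            .addTerm(new BytesRef("lucene"))
            .build();

    // Before LUCENE-10026 this printed true, because only the terms were compared,
    // so a rewrite that deduplicates clauses by equality could drop one of them.
    // With the fix, fields and weights participate in equals/hashCode and it prints false.
    System.out.println(q1.equals(q2));
  }
}
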
[lucene-solr] 01/02: LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)

Posted by ju...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

julietibs pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 215e4a5882c540ff9667455c30a707c629e83675
Author: Julie Tibshirani <ju...@gmail.com>
AuthorDate: Fri Jul 16 18:45:46 2021 -0700

    LUCENE-9999: CombinedFieldQuery can fail with an exception when document is missing fields (#185)
    
    This change fixes a bug in `MultiNormsLeafSimScorer`, which assumed that each
    field has a norm for every term/document.
    
    As part of the fix, it adds validation that the fields have consistent norms
    settings.
---
 lucene/CHANGES.txt                                 |   3 +
 .../apache/lucene/search/CombinedFieldQuery.java   |  29 ++++++
 .../lucene/search/MultiNormsLeafSimScorer.java     |  21 +++--
 .../lucene/search/TestCombinedFieldQuery.java      | 102 +++++++++++++++++++++
 4 files changed, 149 insertions(+), 6 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ae8a845..c8ede43 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -64,6 +64,9 @@ Bug Fixes
 
 * LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
 
+* LUCENE-9999: CombinedFieldQuery can fail with an exception when document
+  is missing some fields. (Jim Ferenczi, Julie Tibshirani)
+
 * LUCENE-10020: DocComparator should not skip docs with the same docID on
   multiple sorts with search after (Mayya Sharipova, Julie Tibshirani)
 
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
index 23d5047..6a4f16c 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
@@ -27,6 +27,8 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.TreeMap;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
@@ -62,6 +64,9 @@ import org.apache.lucene.util.SmallFloat;
  * compute norms the same way as {@link SimilarityBase#computeNorm}, which includes {@link
  * BM25Similarity} and {@link DFRSimilarity}. Per-field similarities are not supported.
  *
+ * <p>The query also requires that either all fields or no fields have norms enabled. Having only
+ * some fields with norms enabled can result in errors.
+ *
  * <p>The scoring is based on BM25F's simple formula described in:
  * http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf. This query implements the
  * same approach but allows other similarities besides {@link
@@ -271,6 +276,7 @@ public final class CombinedFieldQuery extends Query implements Accountable {
   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
       throws IOException {
+    validateConsistentNorms(searcher.getIndexReader());
     if (scoreMode.needsScores()) {
       return new CombinedFieldWeight(this, searcher, scoreMode, boost);
     } else {
@@ -280,6 +286,29 @@ public final class CombinedFieldQuery extends Query implements Accountable {
     }
   }
 
+  private void validateConsistentNorms(IndexReader reader) {
+    boolean allFieldsHaveNorms = true;
+    boolean noFieldsHaveNorms = true;
+
+    for (LeafReaderContext context : reader.leaves()) {
+      FieldInfos fieldInfos = context.reader().getFieldInfos();
+      for (String field : fieldAndWeights.keySet()) {
+        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+        if (fieldInfo != null) {
+          allFieldsHaveNorms &= fieldInfo.hasNorms();
+          noFieldsHaveNorms &= fieldInfo.omitsNorms();
+        }
+      }
+    }
+
+    if (allFieldsHaveNorms == false && noFieldsHaveNorms == false) {
+      throw new IllegalArgumentException(
+          getClass().getSimpleName()
+              + " requires norms to be consistent across fields: some fields cannot"
+              + " have norms enabled, while others have norms disabled");
+    }
+  }
+
   class CombinedFieldWeight extends Weight {
     private final IndexSearcher searcher;
     private final TermStates termStates[];
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
index 8ebefcd..802a53a 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
@@ -28,7 +28,13 @@ import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.util.SmallFloat;
 
-/** Copy of {@link LeafSimScorer} that sums document's norms from multiple fields. */
+/**
+ * Copy of {@link LeafSimScorer} that sums document's norms from multiple fields.
+ *
+ * <p>For all fields, norms must be encoded using {@link SmallFloat#intToByte4}. This scorer also
+ * requires that either all fields or no fields have norms enabled. Having only some fields with
+ * norms enabled can result in errors or undefined behavior.
+ */
 final class MultiNormsLeafSimScorer {
   /** Cache of decoded norms. */
   private static final float[] LENGTH_TABLE = new float[256];
@@ -60,6 +66,7 @@ final class MultiNormsLeafSimScorer {
           weightList.add(field.weight);
         }
       }
+
       if (normsList.isEmpty()) {
         norms = null;
       } else if (normsList.size() == 1) {
@@ -126,14 +133,16 @@ final class MultiNormsLeafSimScorer {
     @Override
     public boolean advanceExact(int target) throws IOException {
       float normValue = 0;
+      boolean found = false;
       for (int i = 0; i < normsArr.length; i++) {
-        boolean found = normsArr[i].advanceExact(target);
-        assert found;
-        normValue +=
-            weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+        if (normsArr[i].advanceExact(target)) {
+          normValue +=
+              weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
+          found = true;
+        }
       }
       current = SmallFloat.intToByte4(Math.round(normValue));
-      return true;
+      return found;
     }
 
     @Override
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java
index 0a1880f..8e2fba1 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java
@@ -165,6 +165,59 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
     dir.close();
   }
 
+  public void testNormsDisabled() throws IOException {
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    Document doc = new Document();
+    doc.add(new StringField("a", "value", Store.NO));
+    doc.add(new StringField("b", "value", Store.NO));
+    doc.add(new TextField("c", "value", Store.NO));
+    w.addDocument(doc);
+    w.commit();
+
+    doc = new Document();
+    doc.add(new StringField("a", "value", Store.NO));
+    doc.add(new TextField("c", "value", Store.NO));
+    w.addDocument(doc);
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    Similarity searchSimilarity = randomCompatibleSimilarity();
+    searcher.setSimilarity(searchSimilarity);
+    TopScoreDocCollector collector = TopScoreDocCollector.create(10, null, 10);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", 1.0f)
+            .addField("b", 1.0f)
+            .addTerm(new BytesRef("value"))
+            .build();
+    searcher.search(query, collector);
+    TopDocs topDocs = collector.topDocs();
+    assertEquals(new TotalHits(2, TotalHits.Relation.EQUAL_TO), topDocs.totalHits);
+
+    CombinedFieldQuery invalidQuery =
+        new CombinedFieldQuery.Builder()
+            .addField("b", 1.0f)
+            .addField("c", 1.0f)
+            .addTerm(new BytesRef("value"))
+            .build();
+    IllegalArgumentException e =
+        expectThrows(
+            IllegalArgumentException.class, () -> searcher.search(invalidQuery, collector));
+    assertTrue(e.getMessage().contains("requires norms to be consistent across fields"));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   public void testCopyField() throws IOException {
     Directory dir = newDirectory();
     Similarity similarity = randomCompatibleSimilarity();
@@ -266,6 +319,55 @@ public class TestCombinedFieldQuery extends LuceneTestCase {
     dir.close();
   }
 
+  public void testCopyFieldWithMissingFields() throws IOException {
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+    int numMatch = atLeast(10);
+    for (int i = 0; i < numMatch; i++) {
+      Document doc = new Document();
+      int freqA = random().nextInt(5) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+
+      // Choose frequencies such that sometimes we don't add field B
+      int freqB = random().nextInt(3);
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "foo", Store.NO));
+      }
+
+      int freqAB = freqA * boost1 + freqB * boost2;
+      for (int j = 0; j < freqAB; j++) {
+        doc.add(new TextField("ab", "foo", Store.NO));
+      }
+
+      w.addDocument(doc);
+    }
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(similarity);
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder()
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .addTerm(new BytesRef("foo"))
+            .build();
+
+    checkExpectedHits(searcher, numMatch, query, new TermQuery(new Term("ab", "foo")));
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
   private static Similarity randomCompatibleSimilarity() {
     return RandomPicks.randomFrom(
         random(),