You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2020/12/18 15:15:16 UTC
[lucene-solr] 02/02: LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup to avoid negative norms in long documents (#2138)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 9a5315532b1ba1bcb479dcd7f55a16c444bc7308
Author: yiluncui <75...@users.noreply.github.com>
AuthorDate: Fri Dec 18 06:56:31 2020 -0800

    LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup to avoid negative norms in long documents (#2138)
---
 lucene/CHANGES.txt                                 |  3 +
 .../lucene/search/MultiNormsLeafSimScorer.java     |  2 +-
 .../org/apache/lucene/search/TestBM25FQuery.java   | 79 ++++++++++++++++++++++
 3 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8faeab5..6e39b2e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -59,6 +59,9 @@ Bug Fixes
   (Ignacio Vera)   
 
 * LUCENE-9606: Wrap boolean queries generated by shape fields with a Constant score query. (Ignacio Vera)  
+
+* LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup to avoid
+  negative norms in long documents. (Yilun Cui)
   
 Other
 ---------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
index 75c9801..9a37d3d 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
@@ -126,7 +126,7 @@ final class MultiNormsLeafSimScorer {
       for (int i = 0; i < normsArr.length; i++) {
         boolean found = normsArr[i].advanceExact(target);
         assert found;
-        normValue += weightArr[i] * LENGTH_TABLE[(byte) normsArr[i].longValue()];
+        normValue += weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
       }
       current = SmallFloat.intToByte4(Math.round(normValue));
       return true;
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java
index 36c0054..5900bbb 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java
@@ -23,11 +23,14 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -166,4 +169,80 @@ public class TestBM25FQuery extends LuceneTestCase {
     w.close();
     dir.close();
   }
+
+  public void testDocWithNegativeNorms() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(new NegativeNormSimilarity());
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    String queryString = "foo";
+
+    Document doc = new Document();
+    // Both fields must contain a token that matches the query string "foo".
+    doc.add(new TextField("f", "foo", Store.NO));
+    doc.add(new TextField("g", "foo baz", Store.NO));
+    w.addDocument(doc);
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    BM25FQuery query = new BM25FQuery.Builder()
+            .addField("f")
+            .addField("g")
+            .addTerm(new BytesRef(queryString))
+            .build();
+    TopDocs topDocs = searcher.search(query, 10);
+    CheckHits.checkDocIds("queried docs do not match", new int[]{0}, topDocs.scoreDocs);
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
+  public void testMultipleDocsNegativeNorms() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(new NegativeNormSimilarity());
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    String queryString = "foo";
+
+    Document doc0 = new Document();
+    doc0.add(new TextField("f", "foo", Store.NO));
+    doc0.add(new TextField("g", "foo baz", Store.NO));
+    w.addDocument(doc0);
+
+    Document doc1 = new Document();
+    // add another match on the query string to the second doc
+    doc1.add(new TextField("f", "foo is foo", Store.NO));
+    doc1.add(new TextField("g", "foo baz", Store.NO));
+    w.addDocument(doc1);
+
+    IndexReader reader = w.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+    BM25FQuery query = new BM25FQuery.Builder()
+            .addField("f")
+            .addField("g")
+            .addTerm(new BytesRef(queryString))
+            .build();
+    TopDocs topDocs = searcher.search(query, 10);
+    // doc1 is expected to rank ahead of doc0 because its term frequency for "foo" is higher.
+    CheckHits.checkDocIds("queried docs do not match", new int[]{1,0}, topDocs.scoreDocs);
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
+
+  private static final class NegativeNormSimilarity extends Similarity {
+    @Override
+    public long computeNorm(FieldInvertState state) {
+      return -128;
+    }
+
+    @Override
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new BM25Similarity().scorer(boost, collectionStats, termStats);
+    }
+  }
 }