You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2020/12/18 15:15:16 UTC
[lucene-solr] 02/02: LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup to avoid negative norms in long documents (#2138)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 9a5315532b1ba1bcb479dcd7f55a16c444bc7308
Author: yiluncui <75...@users.noreply.github.com>
AuthorDate: Fri Dec 18 06:56:31 2020 -0800
LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup to avoid negative norms in long documents (#2138)
---
lucene/CHANGES.txt | 3 +
.../lucene/search/MultiNormsLeafSimScorer.java | 2 +-
.../org/apache/lucene/search/TestBM25FQuery.java | 79 ++++++++++++++++++++++
3 files changed, 83 insertions(+), 1 deletion(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8faeab5..6e39b2e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -59,6 +59,9 @@ Bug Fixes
(Ignacio Vera)
* LUCENE-9606: Wrap boolean queries generated by shape fields with a Constant score query. (Ignacio Vera)
+
+* LUCENE-9635: BM25FQuery - Mask encoded norm long value in array lookup.
+ (Yilun Cui)
Other
---------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
index 75c9801..9a37d3d 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
@@ -126,7 +126,7 @@ final class MultiNormsLeafSimScorer {
for (int i = 0; i < normsArr.length; i++) {
boolean found = normsArr[i].advanceExact(target);
assert found;
- normValue += weightArr[i] * LENGTH_TABLE[(byte) normsArr[i].longValue()];
+ normValue += weightArr[i] * LENGTH_TABLE[Byte.toUnsignedInt((byte) normsArr[i].longValue())];
}
current = SmallFloat.intToByte4(Math.round(normValue));
return true;
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java
index 36c0054..5900bbb 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestBM25FQuery.java
@@ -23,11 +23,14 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -166,4 +169,80 @@ public class TestBM25FQuery extends LuceneTestCase {
w.close();
dir.close();
}
+ /** Regression test for LUCENE-9635: a single doc indexed with norms of -128 must still be returned by a two-field BM25FQuery. */
+ public void testDocWithNegativeNorms() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig();
+ iwc.setSimilarity(new NegativeNormSimilarity()); // computeNorm() of this similarity always returns -128
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+ String queryString = "foo";
+
+ Document doc = new Document();
+ // Both fields must contain a token that matches the query string "foo".
+ doc.add(new TextField("f", "foo", Store.NO));
+ doc.add(new TextField("g", "foo baz", Store.NO));
+ w.addDocument(doc);
+
+ IndexReader reader = w.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ BM25FQuery query = new BM25FQuery.Builder()
+ .addField("f")
+ .addField("g")
+ .addTerm(new BytesRef(queryString))
+ .build();
+ TopDocs topDocs = searcher.search(query, 10);
+ CheckHits.checkDocIds("queried docs do not match", new int[]{0}, topDocs.scoreDocs); // the only indexed doc, ID 0
+
+ reader.close();
+ w.close();
+ dir.close();
+ }
+ /** Regression test for LUCENE-9635: with norms of -128, the doc containing "foo" more often is expected ahead of the other. */
+ public void testMultipleDocsNegativeNorms() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = new IndexWriterConfig();
+ iwc.setSimilarity(new NegativeNormSimilarity()); // computeNorm() of this similarity always returns -128
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+ String queryString = "foo";
+
+ Document doc0 = new Document();
+ doc0.add(new TextField("f", "foo", Store.NO));
+ doc0.add(new TextField("g", "foo baz", Store.NO));
+ w.addDocument(doc0);
+
+ Document doc1 = new Document();
+ // doc1's "f" field contains "foo" twice, one more occurrence than doc0's "f" field.
+ doc1.add(new TextField("f", "foo is foo", Store.NO));
+ doc1.add(new TextField("g", "foo baz", Store.NO));
+ w.addDocument(doc1);
+
+ IndexReader reader = w.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ BM25FQuery query = new BM25FQuery.Builder()
+ .addField("f")
+ .addField("g")
+ .addTerm(new BytesRef(queryString))
+ .build();
+ TopDocs topDocs = searcher.search(query, 10);
+ // doc1 is expected ahead of doc0 because its term frequency for "foo" is higher.
+ CheckHits.checkDocIds("queried docs do not match", new int[]{1,0}, topDocs.scoreDocs);
+
+ reader.close();
+ w.close();
+ dir.close();
+ }
+ /** Test-only Similarity whose computeNorm always returns -128; scorer creation is delegated to BM25Similarity. */
+ private static final class NegativeNormSimilarity extends Similarity {
+ @Override
+ public long computeNorm(FieldInvertState state) {
+ return -128; // cast to byte this is negative, so it is a negative LENGTH_TABLE index unless passed through Byte.toUnsignedInt
+ }
+
+ @Override
+ public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+ return new BM25Similarity().scorer(boost, collectionStats, termStats); // delegates to a fresh BM25Similarity
+ }
+ }
}