You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/11/10 14:38:39 UTC
[2/2] lucene-solr:branch_6x: LUCENE-5867: Add a BooleanSimilarity.
LUCENE-5867: Add a BooleanSimilarity.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/74da1cff
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/74da1cff
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/74da1cff
Branch: refs/heads/branch_6x
Commit: 74da1cff27467ab540c343dd589832d9f417dd25
Parents: be47009
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Nov 10 14:05:13 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Nov 10 15:21:02 2016 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 5 +-
.../search/similarities/BooleanSimilarity.java | 101 ++++++++++++++++
.../similarities/TestBooleanSimilarity.java | 117 +++++++++++++++++++
.../search/similarities/RandomSimilarity.java | 1 +
.../util/TestRuleSetupAndRestoreClassEnv.java | 3 +-
5 files changed, 224 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f7f54c0..9979eb9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -4,7 +4,10 @@ For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions
======================= Lucene 6.4.0 =======================
-(No Changes)
+
+New features
+
+* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)
Improvements
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
new file mode 100644
index 0000000..2eb5699
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Simple similarity that gives terms a score that is equal to their query
+ * boost. This similarity is typically used with disabled norms since neither
+ * document statistics nor index statistics are used for scoring. That said,
+ * if norms are enabled, they will be computed the same way as
+ * {@link SimilarityBase} and {@link BM25Similarity} with
+ * {@link SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps}
+ * so that the {@link Similarity} can be changed after the index has been
+ * created.
+ */
+public class BooleanSimilarity extends Similarity {
+  /** Default BM25 instance that norm computation is delegated to. */
+  private static final Similarity BM25_SIM = new BM25Similarity();
+
+  /** Sole constructor */
+  public BooleanSimilarity() {}
+
+  @Override
+  public long computeNorm(FieldInvertState state) {
+    return BM25_SIM.computeNorm(state);
+  }
+
+  @Override
+  public SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
+    return new BooleanWeight();
+  }
+
+  /** Weight that only records the query boost; {@code queryNorm} is not used. */
+  private static class BooleanWeight extends SimWeight {
+    float boost = 1f;
+    @Override
+    public void normalize(float queryNorm, float boost) {
+      this.boost = boost;
+    }
+
+    @Override
+    public float getValueForNormalization() {
+      return boost * boost;
+    }
+  }
+
+  /** Returns a scorer that gives every document the boost stored in {@code weight}. */
+  @Override
+  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    final float boost = ((BooleanWeight) weight).boost;
+    // Resolve the name here: inside the anonymous scorer getSimpleName() would return "".
+    final String simName = getClass().getSimpleName();
+    return new SimScorer() {
+      @Override
+      public float score(int doc, float freq) {
+        return boost;
+      }
+
+      @Override
+      public Explanation explain(int doc, Explanation freq) {
+        Explanation queryBoostExpl = Explanation.match(boost, "query boost");
+        return Explanation.match(
+            queryBoostExpl.getValue(),
+            "score(" + simName + ", doc=" + doc + "), computed from:",
+            queryBoostExpl);
+      }
+
+      @Override
+      public float computeSlopFactor(int distance) {
+        return 1f;
+      }
+
+      @Override
+      public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+        return 1f;
+      }
+    };
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
new file mode 100644
index 0000000..15b1448
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+/** Checks that {@link BooleanSimilarity} scores equal the query boost and that its norms match BM25. */
+public class TestBooleanSimilarity extends LuceneTestCase {
+
+  /** Every term-query hit must score 1, or the boost value when wrapped in a {@link BoostQuery}. */
+  public void testTermScoreIsEqualToBoost() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig());
+    Document doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    doc.add(new StringField("foo", "baz", Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    doc.add(new StringField("foo", "bar", Store.NO));
+    w.addDocument(doc);
+
+    DirectoryReader reader = w.getReader();
+    w.close();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(new BooleanSimilarity());
+    TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "bar")), 2);
+    assertEquals(2, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+    assertEquals(1f, topDocs.scoreDocs[1].score, 0f);
+
+    topDocs = searcher.search(new TermQuery(new Term("foo", "baz")), 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+
+    topDocs = searcher.search(new BoostQuery(new TermQuery(new Term("foo", "baz")), 3f), 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(3f, topDocs.scoreDocs[0].score, 0f);
+
+    reader.close();
+    dir.close();
+  }
+
+  /** A matching phrase query must score 1, or the boost value when wrapped in a {@link BoostQuery}. */
+  public void testPhraseScoreIsEqualToBoost() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig().setSimilarity(new BooleanSimilarity()));
+    Document doc = new Document();
+    doc.add(new TextField("foo", "bar baz quux", Store.NO));
+    w.addDocument(doc);
+
+    DirectoryReader reader = w.getReader();
+    w.close();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(new BooleanSimilarity());
+
+    PhraseQuery query = new PhraseQuery(2, "foo", "bar", "quux");
+    TopDocs topDocs = searcher.search(query, 2);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+
+    topDocs = searcher.search(new BoostQuery(query, 7), 2);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(7f, topDocs.scoreDocs[0].score, 0f);
+
+    reader.close();
+    dir.close();
+  }
+
+  /** Norms are {@code long} values, so they are compared exactly rather than via a float delta. */
+  public void testSameNormsAsBM25() {
+    BooleanSimilarity sim1 = new BooleanSimilarity();
+    BM25Similarity sim2 = new BM25Similarity();
+    sim2.setDiscountOverlaps(true);
+    for (int iter = 0; iter < 100; ++iter) {
+      final int length = TestUtil.nextInt(random(), 1, 100);
+      final int position = random().nextInt(length);
+      final int numOverlaps = random().nextInt(50);
+      final float boost = random().nextFloat() * 10;
+      FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100, boost);
+      assertEquals(sim2.computeNorm(state), sim1.computeNorm(state));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
index 6231739..e6f43b6 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
@@ -126,6 +126,7 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
allSims = new ArrayList<>();
allSims.add(new ClassicSimilarity());
allSims.add(new BM25Similarity());
+ allSims.add(new BooleanSimilarity());
for (BasicModel basicModel : BASIC_MODELS) {
for (AfterEffect afterEffect : AFTER_EFFECTS) {
for (Normalization normalization : NORMALIZATIONS) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
index d71510e..4a47f59 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene62.Lucene62Codec;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
-import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.RandomSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@@ -213,7 +212,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
TimeZone randomTimeZone = randomTimeZone(random());
timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone);
TimeZone.setDefault(timeZone);
- similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random());
+ similarity = new RandomSimilarity(random());
// Check codec restrictions once at class level.
try {