You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/11/10 14:38:39 UTC

[2/2] lucene-solr:branch_6x: LUCENE-5867: Add a BooleanSimilarity.

LUCENE-5867: Add a BooleanSimilarity.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/74da1cff
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/74da1cff
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/74da1cff

Branch: refs/heads/branch_6x
Commit: 74da1cff27467ab540c343dd589832d9f417dd25
Parents: be47009
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Nov 10 14:05:13 2016 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Nov 10 15:21:02 2016 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   5 +-
 .../search/similarities/BooleanSimilarity.java  | 101 ++++++++++++++++
 .../similarities/TestBooleanSimilarity.java     | 117 +++++++++++++++++++
 .../search/similarities/RandomSimilarity.java   |   1 +
 .../util/TestRuleSetupAndRestoreClassEnv.java   |   3 +-
 5 files changed, 224 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f7f54c0..9979eb9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -4,7 +4,10 @@ For more information on past and future Lucene versions, please see:
 http://s.apache.org/luceneversions
 
 ======================= Lucene 6.4.0 =======================
-(No Changes)
+
+New features
+
+* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)
 
 Improvements
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
new file mode 100644
index 0000000..2eb5699
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Simple similarity that gives terms a score that is equal to their query
+ * boost. This similarity is typically used with disabled norms since neither
+ * document statistics nor index statistics are used for scoring. That said,
+ * if norms are enabled, they will be computed the same way as
+ * {@link SimilarityBase} and {@link BM25Similarity} with
+ * {@link SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps}
+ * so that the {@link Similarity} can be changed after the index has been
+ * created.
+ */
+public class BooleanSimilarity extends Similarity {
+
+  private static final Similarity BM25_SIM = new BM25Similarity();
+
+  /** Sole constructor */
+  public BooleanSimilarity() {}
+
+  @Override
+  public long computeNorm(FieldInvertState state) {
+    return BM25_SIM.computeNorm(state);
+  }
+
+  @Override
+  public SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
+    return new BooleanWeight();
+  }
+
+  private static class BooleanWeight extends SimWeight {
+    float boost = 1f;
+
+    @Override
+    public void normalize(float queryNorm, float boost) {
+      this.boost = boost;
+    }
+
+    @Override
+    public float getValueForNormalization() {
+      return boost * boost;
+    }
+  }
+
+  @Override
+  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    final float boost = ((BooleanWeight) weight).boost;
+
+    return new SimScorer() {
+
+      @Override
+      public float score(int doc, float freq) {
+        return boost;
+      }
+
+      @Override
+      public Explanation explain(int doc, Explanation freq) {
+        Explanation queryBoostExpl = Explanation.match(boost, "query boost");
+        return Explanation.match(
+            queryBoostExpl.getValue(),
+            "score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
+            queryBoostExpl);
+      }
+
+      @Override
+      public float computeSlopFactor(int distance) {
+        return 1f;
+      }
+
+      @Override
+      public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+        return 1f;
+      }
+    };
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
new file mode 100644
index 0000000..15b1448
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestBooleanSimilarity extends LuceneTestCase {
+
+  public void testTermScoreIsEqualToBoost() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig());
+    Document doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    doc.add(new StringField("foo", "baz", Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    doc.add(new StringField("foo", "bar", Store.NO));
+    w.addDocument(doc);
+
+    DirectoryReader reader = w.getReader();
+    w.close();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(new BooleanSimilarity());
+    TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "bar")), 2);
+    assertEquals(2, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+    assertEquals(1f, topDocs.scoreDocs[1].score, 0f);
+
+    topDocs = searcher.search(new TermQuery(new Term("foo", "baz")), 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+
+    topDocs = searcher.search(new BoostQuery(new TermQuery(new Term("foo", "baz")), 3f), 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(3f, topDocs.scoreDocs[0].score, 0f);
+
+    reader.close();
+    dir.close();
+  }
+
+  public void testPhraseScoreIsEqualToBoost() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig().setSimilarity(new BooleanSimilarity()));
+    Document doc = new Document();
+    doc.add(new TextField("foo", "bar baz quux", Store.NO));
+    w.addDocument(doc);
+
+    DirectoryReader reader = w.getReader();
+    w.close();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(new BooleanSimilarity());
+
+    PhraseQuery query = new PhraseQuery(2, "foo", "bar", "quux");
+
+    TopDocs topDocs = searcher.search(query, 2);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+
+    topDocs = searcher.search(new BoostQuery(query, 7), 2);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(7f, topDocs.scoreDocs[0].score, 0f);
+
+    reader.close();
+    dir.close();
+  }
+
+  public void testSameNormsAsBM25() {
+    BooleanSimilarity sim1 = new BooleanSimilarity();
+    BM25Similarity sim2 = new BM25Similarity();
+    sim2.setDiscountOverlaps(true);
+    for (int iter = 0; iter < 100; ++iter) {
+      final int length = TestUtil.nextInt(random(), 1, 100);
+      final int position = random().nextInt(length);
+      final int numOverlaps = random().nextInt(50);
+      final float boost = random().nextFloat() * 10;
+      FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100, boost);
+      assertEquals(
+          sim2.computeNorm(state),
+          sim1.computeNorm(state),
+          0f);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
index 6231739..e6f43b6 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
@@ -126,6 +126,7 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
     allSims = new ArrayList<>();
     allSims.add(new ClassicSimilarity());
     allSims.add(new BM25Similarity());
+    allSims.add(new BooleanSimilarity());
     for (BasicModel basicModel : BASIC_MODELS) {
       for (AfterEffect afterEffect : AFTER_EFFECTS) {
         for (Normalization normalization : NORMALIZATIONS) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/74da1cff/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
index d71510e..4a47f59 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene62.Lucene62Codec;
 import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.index.RandomCodec;
-import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.RandomSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@@ -213,7 +212,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
     TimeZone randomTimeZone = randomTimeZone(random());
     timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone);
     TimeZone.setDefault(timeZone);
-    similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random());
+    similarity = new RandomSimilarity(random());
 
     // Check codec restrictions once at class level.
     try {