You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2018/01/04 14:23:14 UTC
[3/3] lucene-solr:master: LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document scoring factors.
LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document scoring factors.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8fd7ead9
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8fd7ead9
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8fd7ead9
Branch: refs/heads/master
Commit: 8fd7ead940f69a892dfc951a1aa042e8cae806c1
Parents: 8836fda
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Jan 4 15:13:36 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Jan 4 15:13:36 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 +
.../classification/KNearestFuzzyClassifier.java | 2 +-
.../KNearestNeighborClassifier.java | 2 +-
.../org/apache/lucene/search/BooleanWeight.java | 2 +-
.../apache/lucene/search/ExactPhraseScorer.java | 7 +-
.../org/apache/lucene/search/IndexSearcher.java | 45 +----
.../org/apache/lucene/search/LeafSimScorer.java | 74 +++++++
.../apache/lucene/search/MultiPhraseQuery.java | 29 +--
.../org/apache/lucene/search/PhraseQuery.java | 13 +-
.../lucene/search/SloppyPhraseScorer.java | 7 +-
.../org/apache/lucene/search/SynonymQuery.java | 35 ++--
.../org/apache/lucene/search/TermQuery.java | 15 +-
.../org/apache/lucene/search/TermScorer.java | 11 +-
.../org/apache/lucene/search/package-info.java | 4 +-
.../lucene/search/similarities/Axiomatic.java | 10 +-
.../search/similarities/BM25Similarity.java | 142 +++++---------
.../lucene/search/similarities/BasicStats.java | 2 +-
.../search/similarities/BooleanSimilarity.java | 51 ++---
.../search/similarities/DFISimilarity.java | 6 +-
.../search/similarities/DFRSimilarity.java | 8 +-
.../search/similarities/IBSimilarity.java | 8 +-
.../similarities/LMDirichletSimilarity.java | 10 +-
.../similarities/LMJelinekMercerSimilarity.java | 10 +-
.../search/similarities/LMSimilarity.java | 2 +-
.../search/similarities/MultiSimilarity.java | 37 +---
.../similarities/PerFieldSimilarityWrapper.java | 22 +--
.../lucene/search/similarities/Similarity.java | 99 ++++------
.../search/similarities/SimilarityBase.java | 79 +++-----
.../search/similarities/TFIDFSimilarity.java | 133 +++++--------
.../apache/lucene/search/spans/SpanScorer.java | 6 +-
.../apache/lucene/search/spans/SpanWeight.java | 22 +--
.../apache/lucene/search/spans/TermSpans.java | 4 +-
.../apache/lucene/index/TestCustomNorms.java | 7 +-
.../apache/lucene/index/TestCustomTermFreq.java | 9 +-
.../lucene/index/TestFieldInvertState.java | 8 +-
.../apache/lucene/index/TestIndexSorting.java | 9 +-
.../lucene/index/TestMaxTermFrequency.java | 12 +-
.../test/org/apache/lucene/index/TestNorms.java | 7 +-
.../lucene/index/TestUniqueTermCount.java | 8 +-
.../apache/lucene/search/JustCompileSearch.java | 7 +-
.../org/apache/lucene/search/TestBoolean2.java | 4 +-
.../search/TestBooleanQueryVisitSubscorers.java | 12 +-
.../lucene/search/TestBooleanRewrites.java | 2 +-
.../apache/lucene/search/TestConjunctions.java | 12 +-
.../lucene/search/TestDocValuesScoring.java | 192 -------------------
.../lucene/search/TestMinShouldMatch2.java | 9 +-
.../lucene/search/TestSimilarityProvider.java | 23 +--
.../lucene/search/TestSubScorerFreqs.java | 12 +-
.../similarities/TestClassicSimilarity.java | 3 +-
.../search/similarities/TestSimilarityBase.java | 11 +-
.../search/spans/TestFieldMaskingSpanQuery.java | 4 +-
.../lucene/index/memory/TestMemoryIndex.java | 8 +-
.../search/TestDiversifiedTopDocsCollector.java | 132 ++++++++-----
.../function/valuesource/IDFValueSource.java | 2 +-
.../function/valuesource/NormValueSource.java | 10 +-
.../function/valuesource/TFValueSource.java | 2 +-
.../queries/payloads/PayloadScoreQuery.java | 7 +-
.../queries/payloads/SpanPayloadCheckQuery.java | 4 +-
.../function/TestLongNormValueSource.java | 2 +-
.../queries/function/TestValueSources.java | 8 +-
.../lucene/search/TermAutomatonQuery.java | 8 +-
.../lucene/search/TermAutomatonScorer.java | 7 +-
.../lucene/index/BaseNormsFormatTestCase.java | 7 +-
.../org/apache/lucene/search/QueryUtils.java | 10 +-
.../similarities/AssertingSimilarity.java | 95 ++++-----
.../similarities/BaseSimilarityTestCase.java | 140 ++------------
.../search/spans/AssertingSpanWeight.java | 4 +-
.../similarities/BaseSimilarityTestCase.java | 2 +-
68 files changed, 606 insertions(+), 1096 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ff94809..16050d1 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -32,6 +32,9 @@ API Changes
* LUCENE-8012: Explanation now takes Number rather than float (Alan Woodward,
Robert Muir)
+* LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document
+ scoring factors. (Adrien Grand)
+
Changes in Runtime Behavior
* LUCENE-7837: Indices that were created before the previous major version
@@ -46,6 +49,9 @@ Changes in Runtime Behavior
* LUCENE-7996: FunctionQuery and FunctionScoreQuery now return a score of 0
when the function produces a negative value. (Adrien Grand)
+* LUCENE-8116: Similarities now score fields that omit norms as if the norm was
+ 1. This might change score values on fields that omit norms. (Adrien Grand)
+
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
----------------------------------------------------------------------
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
index cbd241b..14f9a27 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
@@ -213,7 +213,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
- ", similarity=" + indexSearcher.getSimilarity(true) +
+ ", similarity=" + indexSearcher.getSimilarity() +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
----------------------------------------------------------------------
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
index f0391f4..e6ad4a3 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
@@ -251,7 +251,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
- ", similarity=" + indexSearcher.getSimilarity(true) +
+ ", similarity=" + indexSearcher.getSimilarity() +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
index 900a77f..fffdd09 100644
--- a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java
@@ -48,7 +48,7 @@ final class BooleanWeight extends Weight {
super(query);
this.query = query;
this.scoreMode = scoreMode;
- this.similarity = searcher.getSimilarity(scoreMode.needsScores());
+ this.similarity = searcher.getSimilarity();
weights = new ArrayList<>();
for (BooleanClause c : query) {
Weight w = searcher.createWeight(c.getQuery(), c.isScoring() ? scoreMode : ScoreMode.COMPLETE_NO_SCORES, boost);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index f4a7ca7..e2d6d80 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.search.similarities.Similarity;
final class ExactPhraseScorer extends Scorer {
@@ -42,13 +41,13 @@ final class ExactPhraseScorer extends Scorer {
private int freq;
- private final Similarity.SimScorer docScorer;
+ private final LeafSimScorer docScorer;
private final boolean needsScores, needsTotalHitCount;
private float matchCost;
private float minCompetitiveScore;
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
- Similarity.SimScorer docScorer, ScoreMode scoreMode,
+ LeafSimScorer docScorer, ScoreMode scoreMode,
float matchCost) throws IOException {
super(weight);
this.docScorer = docScorer;
@@ -123,7 +122,7 @@ final class ExactPhraseScorer extends Scorer {
@Override
public float maxScore() {
- return docScorer.maxScore(Integer.MAX_VALUE);
+ return docScorer.maxScore();
}
/** Advance the given pos enum to the first doc on or after {@code target}.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
index 5ee815c..fc87563 100644
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -32,7 +32,6 @@ import java.util.concurrent.Future;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.IndexWriter;
@@ -75,36 +74,6 @@ import org.apache.lucene.util.ThreadInterruptedException;
*/
public class IndexSearcher {
- /** A search-time {@link Similarity} that does not make use of scoring factors
- * and may be used when scores are not needed. */
- private static final Similarity NON_SCORING_SIMILARITY = new Similarity() {
-
- @Override
- public long computeNorm(FieldInvertState state) {
- throw new UnsupportedOperationException("This Similarity may only be used for searching, not indexing");
- }
-
- @Override
- public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
- return new SimWeight() {};
- }
-
- @Override
- public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
- return new SimScorer() {
- @Override
- public float score(int doc, float freq) {
- return 0f;
- }
- @Override
- public float maxScore(float maxFreq) {
- return 0f;
- }
- };
- }
-
- };
-
private static QueryCache DEFAULT_QUERY_CACHE;
private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy();
static {
@@ -136,7 +105,7 @@ public class IndexSearcher {
* Expert: returns a default Similarity instance.
* In general, this method is only called to initialize searchers and writers.
* User code and query implementations should respect
- * {@link IndexSearcher#getSimilarity(boolean)}.
+ * {@link IndexSearcher#getSimilarity()}.
* @lucene.internal
*/
public static Similarity getDefaultSimilarity() {
@@ -329,15 +298,11 @@ public class IndexSearcher {
this.similarity = similarity;
}
- /** Expert: Get the {@link Similarity} to use to compute scores. When
- * {@code needsScores} is {@code false}, this method will return a simple
- * {@link Similarity} that does not leverage scoring factors such as norms.
- * When {@code needsScores} is {@code true}, this returns the
+ /** Expert: Get the {@link Similarity} to use to compute scores. This returns the
* {@link Similarity} that has been set through {@link #setSimilarity(Similarity)}
- * or the {@link #getDefaultSimilarity()} default {@link Similarity} if none
- * has been set explicitly. */
- public Similarity getSimilarity(boolean needsScores) {
- return needsScores ? similarity : NON_SCORING_SIMILARITY;
+ * or the default {@link Similarity} if none has been set explicitly. */
+ public Similarity getSimilarity() {
+ return similarity;
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java b/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java
new file mode 100644
index 0000000..52b7d92
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+/**
+ * {@link SimScorer} on a specific {@link LeafReader}.
+ */
+public final class LeafSimScorer {
+
+ private final SimScorer scorer;
+ private final NumericDocValues norms;
+ private final float maxScore;
+
+ /**
+ * Sole constructor: Score documents of {@code reader} with {@code scorer}.
+ */
+ public LeafSimScorer(SimScorer scorer, LeafReader reader, boolean needsScores, float maxFreq) throws IOException {
+ this.scorer = scorer;
+ norms = needsScores ? reader.getNormValues(scorer.getField()) : null;
+ maxScore = scorer.maxScore(maxFreq);
+ }
+
+ private long getNormValue(int doc) throws IOException {
+ if (norms != null) {
+ boolean found = norms.advanceExact(doc);
+ assert found;
+ return norms.longValue();
+ } else {
+ return 1L; // default norm
+ }
+ }
+
+ /** Score the provided document assuming the given term document frequency.
+ * This method must be called on non-decreasing sequences of doc ids.
+ * @see SimScorer#score(float, long) */
+ public float score(int doc, float freq) throws IOException {
+ return scorer.score(freq, getNormValue(doc));
+ }
+
+ /** Explain the score for the provided document assuming the given term document frequency.
+ * This method must be called on non-decreasing sequences of doc ids.
+ * @see SimScorer#explain(Explanation, long) */
+ public Explanation explain(int doc, Explanation freqExpl) throws IOException {
+ return scorer.explain(freqExpl, getNormValue(doc));
+ }
+
+ /**
+ * Return an upper bound of the score.
+ * @see SimScorer#maxScore(float)
+ */
+ public float maxScore() {
+ return maxScore;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
index 34361a7..941416e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -18,19 +18,26 @@ package org.apache.lucene.search;
import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -183,7 +190,7 @@ public class MultiPhraseQuery extends Query {
private class MultiPhraseWeight extends Weight {
private final Similarity similarity;
- private final Similarity.SimWeight stats;
+ private final Similarity.SimScorer stats;
private final Map<Term,TermContext> termContexts = new HashMap<>();
private final ScoreMode scoreMode;
@@ -191,7 +198,7 @@ public class MultiPhraseQuery extends Query {
throws IOException {
super(MultiPhraseQuery.this);
this.scoreMode = scoreMode;
- this.similarity = searcher.getSimilarity(scoreMode.needsScores());
+ this.similarity = searcher.getSimilarity();
final IndexReaderContext context = searcher.getTopReaderContext();
// compute idf
@@ -212,7 +219,7 @@ public class MultiPhraseQuery extends Query {
if (allTermStats.isEmpty()) {
stats = null; // none of the terms were found, we won't use sim at all
} else {
- stats = similarity.computeWeight(
+ stats = similarity.scorer(
boost,
searcher.collectionStatistics(field),
allTermStats.toArray(new TermStatistics[allTermStats.size()]));
@@ -282,11 +289,11 @@ public class MultiPhraseQuery extends Query {
if (slop == 0) {
return new ExactPhraseScorer(this, postingsFreqs,
- similarity.simScorer(stats, context),
+ new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
scoreMode, totalMatchCost);
} else {
return new SloppyPhraseScorer(this, postingsFreqs, slop,
- similarity.simScorer(stats, context),
+ new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY),
scoreMode.needsScores(), totalMatchCost);
}
}
@@ -303,7 +310,7 @@ public class MultiPhraseQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
- SimScorer docScorer = similarity.simScorer(stats, context);
+ LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 3d359b4..295cc90 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -37,7 +37,6 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -352,7 +351,7 @@ public class PhraseQuery extends Query {
private class PhraseWeight extends Weight {
private final Similarity similarity;
- private final Similarity.SimWeight stats;
+ private final Similarity.SimScorer stats;
private final ScoreMode scoreMode;
private transient TermContext states[];
@@ -366,7 +365,7 @@ public class PhraseQuery extends Query {
throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
}
this.scoreMode = scoreMode;
- this.similarity = searcher.getSimilarity(scoreMode.needsScores());
+ this.similarity = searcher.getSimilarity();
final IndexReaderContext context = searcher.getTopReaderContext();
states = new TermContext[terms.length];
TermStatistics termStats[] = new TermStatistics[terms.length];
@@ -380,7 +379,7 @@ public class PhraseQuery extends Query {
}
}
if (termUpTo > 0) {
- stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
+ stats = similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
} else {
stats = null; // no terms at all, we won't use similarity
}
@@ -433,11 +432,11 @@ public class PhraseQuery extends Query {
if (slop == 0) { // optimize exact case
return new ExactPhraseScorer(this, postingsFreqs,
- similarity.simScorer(stats, context),
+ new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
scoreMode, totalMatchCost);
} else {
return new SloppyPhraseScorer(this, postingsFreqs, slop,
- similarity.simScorer(stats, context),
+ new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY),
scoreMode.needsScores(), totalMatchCost);
}
}
@@ -459,7 +458,7 @@ public class PhraseQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
- SimScorer docScorer = similarity.simScorer(stats, context);
+ LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.POSITIVE_INFINITY);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
index dc5490a..60b77c5 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
@@ -26,7 +26,6 @@ import java.util.HashSet;
import java.util.LinkedHashMap;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.FixedBitSet;
final class SloppyPhraseScorer extends Scorer {
@@ -36,7 +35,7 @@ final class SloppyPhraseScorer extends Scorer {
private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq().
- private final Similarity.SimScorer docScorer;
+ private final LeafSimScorer docScorer;
private final int slop;
private final int numPostings;
@@ -55,7 +54,7 @@ final class SloppyPhraseScorer extends Scorer {
private final float matchCost;
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
- int slop, Similarity.SimScorer docScorer, boolean needsScores,
+ int slop, LeafSimScorer docScorer, boolean needsScores,
float matchCost) {
super(weight);
this.docScorer = docScorer;
@@ -558,7 +557,7 @@ final class SloppyPhraseScorer extends Scorer {
@Override
public float maxScore() {
- return docScorer.maxScore(Float.POSITIVE_INFINITY);
+ return docScorer.maxScore();
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
index ce9d6e0..3f4c06d 100644
--- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java
@@ -35,7 +35,6 @@ import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BytesRef;
/**
@@ -129,7 +128,7 @@ public final class SynonymQuery extends Query {
class SynonymWeight extends Weight {
private final TermContext termContexts[];
private final Similarity similarity;
- private final Similarity.SimWeight simWeight;
+ private final Similarity.SimScorer simWeight;
SynonymWeight(Query query, IndexSearcher searcher, float boost) throws IOException {
super(query);
@@ -145,10 +144,10 @@ public final class SynonymQuery extends Query {
totalTermFreq += termStats.totalTermFreq();
}
}
- this.similarity = searcher.getSimilarity(true);
+ this.similarity = searcher.getSimilarity();
if (docFreq > 0) {
TermStatistics pseudoStats = new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
- this.simWeight = similarity.computeWeight(boost, collectionStats, pseudoStats);
+ this.simWeight = similarity.scorer(boost, collectionStats, pseudoStats);
} else {
this.simWeight = null; // no terms exist at all, we won't use similarity
}
@@ -175,7 +174,7 @@ public final class SynonymQuery extends Query {
assert scorer instanceof TermScorer;
freq = ((TermScorer)scorer).freq();
}
- SimScorer docScorer = similarity.simScorer(simWeight, context);
+ LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), true, Float.POSITIVE_INFINITY);
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
@@ -190,7 +189,6 @@ public final class SynonymQuery extends Query {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
- Similarity.SimScorer simScorer = null;
IndexOptions indexOptions = IndexOptions.NONE;
if (terms.length > 0) {
FieldInfo info = context.reader()
@@ -202,21 +200,17 @@ public final class SynonymQuery extends Query {
}
// we use termscorers + disjunction as an impl detail
List<Scorer> subScorers = new ArrayList<>();
- long maxFreq = 0;
+ long totalMaxFreq = 0;
for (int i = 0; i < terms.length; i++) {
TermState state = termContexts[i].get(context.ord);
if (state != null) {
TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
termsEnum.seekExact(terms[i].bytes(), state);
-
- maxFreq += getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
-
+ long termMaxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
+ totalMaxFreq += termMaxFreq;
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
- // lazy init sim, in case no terms exist
- if (simScorer == null) {
- simScorer = similarity.simScorer(simWeight, context);
- }
- subScorers.add(new TermScorer(this, postings, simScorer, Float.POSITIVE_INFINITY));
+ LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, termMaxFreq);
+ subScorers.add(new TermScorer(this, postings, simScorer));
}
}
if (subScorers.isEmpty()) {
@@ -225,7 +219,8 @@ public final class SynonymQuery extends Query {
// we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs
return subScorers.get(0);
} else {
- return new SynonymScorer(simScorer, this, subScorers, maxFreq);
+ LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, totalMaxFreq);
+ return new SynonymScorer(simScorer, this, subScorers);
}
}
@@ -248,13 +243,11 @@ public final class SynonymQuery extends Query {
}
static class SynonymScorer extends DisjunctionScorer {
- private final Similarity.SimScorer similarity;
- private final float maxFreq;
+ private final LeafSimScorer similarity;
- SynonymScorer(Similarity.SimScorer similarity, Weight weight, List<Scorer> subScorers, float maxFreq) {
+ SynonymScorer(LeafSimScorer similarity, Weight weight, List<Scorer> subScorers) {
super(weight, subScorers, true);
this.similarity = similarity;
- this.maxFreq = maxFreq;
}
@Override
@@ -264,7 +257,7 @@ public final class SynonymQuery extends Query {
@Override
public float maxScore() {
- return similarity.maxScore(maxFreq);
+ return similarity.maxScore();
}
/** combines TF of all subs. */
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
index 925fe93..3fa465d 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
@@ -33,7 +33,6 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
* A Query that matches documents containing a term. This may be combined with
@@ -46,7 +45,7 @@ public class TermQuery extends Query {
final class TermWeight extends Weight {
private final Similarity similarity;
- private final Similarity.SimWeight stats;
+ private final Similarity.SimScorer simScorer;
private final TermContext termStates;
private final boolean needsScores;
@@ -58,7 +57,7 @@ public class TermQuery extends Query {
}
this.needsScores = needsScores;
this.termStates = termStates;
- this.similarity = searcher.getSimilarity(needsScores);
+ this.similarity = searcher.getSimilarity();
final CollectionStatistics collectionStats;
final TermStatistics termStats;
@@ -72,9 +71,9 @@ public class TermQuery extends Query {
}
if (termStats == null) {
- this.stats = null; // term doesn't exist in any segment, we won't use similarity at all
+ this.simScorer = null; // term doesn't exist in any segment, we won't use similarity at all
} else {
- this.stats = similarity.computeWeight(boost, collectionStats, termStats);
+ this.simScorer = similarity.scorer(boost, collectionStats, termStats);
}
}
@@ -101,8 +100,8 @@ public class TermQuery extends Query {
.getIndexOptions();
PostingsEnum docs = termsEnum.postings(null, needsScores ? PostingsEnum.FREQS : PostingsEnum.NONE);
assert docs != null;
- return new TermScorer(this, docs, similarity.simScorer(stats, context),
- getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq()));
+ float maxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
+ return new TermScorer(this, docs, new LeafSimScorer(simScorer, context.reader(), needsScores, maxFreq));
}
private long getMaxFreq(IndexOptions indexOptions, long ttf, long df) {
@@ -166,7 +165,7 @@ public class TermQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
- SimScorer docScorer = similarity.simScorer(stats, context);
+ LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Integer.MAX_VALUE);
Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document");
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
index a4aeb04..653a60e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
@@ -20,14 +20,12 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.search.similarities.Similarity;
/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
*/
final class TermScorer extends Scorer {
private final PostingsEnum postingsEnum;
- private final Similarity.SimScorer docScorer;
- private final float maxFreq;
+ private final LeafSimScorer docScorer;
/**
* Construct a <code>TermScorer</code>.
@@ -39,14 +37,11 @@ final class TermScorer extends Scorer {
* @param docScorer
* The <code>Similarity.SimScorer</code> implementation
* to be used for score computations.
- * @param maxFreq
- * An upper bound of the term frequency of the searched term in any document.
*/
- TermScorer(Weight weight, PostingsEnum td, Similarity.SimScorer docScorer, float maxFreq) {
+ TermScorer(Weight weight, PostingsEnum td, LeafSimScorer docScorer) {
super(weight);
this.docScorer = docScorer;
this.postingsEnum = td;
- this.maxFreq = maxFreq;
}
@Override
@@ -71,7 +66,7 @@ final class TermScorer extends Scorer {
@Override
public float maxScore() {
- return docScorer.maxScore(maxFreq);
+ return docScorer.maxScore();
}
/** Returns a string representation of this <code>TermScorer</code>. */
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/package-info.java b/lucene/core/src/java/org/apache/lucene/search/package-info.java
index 69c5c2a..7e53da4 100644
--- a/lucene/core/src/java/org/apache/lucene/search/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/search/package-info.java
@@ -378,7 +378,7 @@
* scored the way it was.
* Typically a weight such as TermWeight
* that scores via a {@link org.apache.lucene.search.similarities.Similarity Similarity} will make use of the Similarity's implementation:
- * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(int, Explanation) SimScorer#explain(int doc, Explanation freq)}.
+ * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(Explanation, long) SimScorer#explain(Explanation freq, long norm)}.
* </li>
* </ol>
* <a name="scorerClass"></a>
@@ -402,7 +402,7 @@
* {@link org.apache.lucene.search.Scorer#score score()} — Return the score of the
* current document. This value can be determined in any appropriate way for an application. For instance, the
* {@link org.apache.lucene.search.TermScorer TermScorer} simply defers to the configured Similarity:
- * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(int, float) SimScorer.score(int doc, float freq)}.
+ * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(float, long) SimScorer.score(float freq, long norm)}.
* </li>
* <li>
* {@link org.apache.lucene.search.Scorer#getChildren getChildren()} — Returns any child subscorers
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
index 1522e5d..3865933 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java
@@ -120,10 +120,10 @@ public abstract class Axiomatic extends SimilarityBase {
@Override
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
double f = freq.getValue().doubleValue();
- explain(subs, stats, doc, f, docLen);
+ explain(subs, stats, f, docLen);
double score = tf(stats, f, docLen)
* ln(stats, f, docLen)
@@ -132,7 +132,7 @@ public abstract class Axiomatic extends SimilarityBase {
- gamma(stats, f, docLen);
Explanation explanation = Explanation.match((float) score,
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
+ "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:",
subs);
if (stats.boost != 1f) {
explanation = Explanation.match((float) (score * stats.boost), "Boosted score, computed as (score * boost) from:",
@@ -148,7 +148,7 @@ public abstract class Axiomatic extends SimilarityBase {
}
@Override
- protected void explain(List<Explanation> subs, BasicStats stats, int doc,
+ protected void explain(List<Explanation> subs, BasicStats stats,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(),
@@ -165,7 +165,7 @@ public abstract class Axiomatic extends SimilarityBase {
subs.add(tflnExplain(stats, freq, docLen));
subs.add(idfExplain(stats, freq, docLen));
subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma"));
- super.explain(subs, stats, doc, freq, docLen);
+ super.explain(subs, stats, freq, docLen);
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index dce156b..09bef40 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -22,8 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -176,7 +174,7 @@ public class BM25Similarity extends Similarity {
}
@Override
- public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+ public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
float avgdl = avgFieldLength(collectionStats);
@@ -184,100 +182,17 @@ public class BM25Similarity extends Similarity {
for (int i = 0; i < cache.length; i++) {
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
}
- return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache);
- }
-
- @Override
- public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
- BM25Stats bm25stats = (BM25Stats) stats;
- return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
- }
-
- private class BM25DocScorer extends SimScorer {
- private final BM25Stats stats;
- private final float weightValue; // boost * idf * (k1 + 1)
- private final NumericDocValues norms;
- /** precomputed cache for all length values */
- private final float[] lengthCache;
- /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
- private final float[] cache;
-
- BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
- this.stats = stats;
- this.weightValue = stats.weight;
- this.norms = norms;
- lengthCache = LENGTH_TABLE;
- cache = stats.cache;
- }
-
- @Override
- public float score(int doc, float freq) throws IOException {
- // if there are no norms, we act as if b=0
- double norm;
- if (norms == null) {
- norm = k1;
- } else {
- boolean found = norms.advanceExact(doc);
- assert found;
- norm = cache[((byte) norms.longValue()) & 0xFF];
- }
- return weightValue * (float) (freq / (freq + norm));
- }
-
- @Override
- public float maxScore(float maxFreq) {
- // TODO: leverage maxFreq and the min norm from the cache
- return weightValue;
- }
-
- @Override
- public Explanation explain(int doc, Explanation freq) throws IOException {
- List<Explanation> subs = new ArrayList<>();
- subs.addAll(stats.explain());
- Explanation tfExpl = explainTF(doc, freq);
- subs.add(tfExpl);
- return Explanation.match(stats.weight * tfExpl.getValue().floatValue(),
- "score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs);
- }
-
- private Explanation explainTF(int doc, Explanation freq) throws IOException {
- List<Explanation> subs = new ArrayList<>();
- subs.add(freq);
- subs.add(Explanation.match(k1, "k1, term saturation parameter"));
- if (norms == null) {
- subs.add(Explanation.match(0, "b, field omits length norms"));
- return Explanation.match(
- (float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) k1)),
- "tf, computed as freq / (freq + k1) from:", subs);
- } else {
- boolean found = norms.advanceExact(doc);
- assert found;
- byte norm = (byte) norms.longValue();
- float doclen = lengthCache[norm & 0xff];
- subs.add(Explanation.match(b, "b, length normalization parameter"));
- if ((norm & 0xFF) > 39) {
- subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
- } else {
- subs.add(Explanation.match(doclen, "dl, length of field"));
- }
- subs.add(Explanation.match(stats.avgdl, "avgdl, average length of field"));
- float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl);
- return Explanation.match(
- (float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)),
- "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
- }
- }
-
+ return new BM25Scorer(collectionStats.field(), boost, k1, b, idf, avgdl, cache);
}
/** Collection statistics for the BM25 model. */
- private static class BM25Stats extends SimWeight {
- /** field name, for pulling norms */
- private final String field;
+ private static class BM25Scorer extends SimScorer {
/** query boost */
private final float boost;
/** k1 value for scale factor */
private final float k1;
+ /** b value for length normalization impact */
+ private final float b;
/** BM25's idf */
private final Explanation idf;
/** The average document length. */
@@ -287,17 +202,57 @@ public class BM25Similarity extends Similarity {
/** weight (idf * boost) */
private final float weight;
- BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) {
- this.field = field;
+ BM25Scorer(String field, float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) {
+ super(field);
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
this.k1 = k1;
+ this.b = b;
this.cache = cache;
this.weight = (k1 + 1) * boost * idf.getValue().floatValue();
}
- private List<Explanation> explain() {
+ @Override
+ public float score(float freq, long encodedNorm) throws IOException {
+ double norm = cache[((byte) encodedNorm) & 0xFF];
+ return weight * (float) (freq / (freq + norm));
+ }
+
+ @Override
+ public float maxScore(float maxFreq) {
+ // TODO: leverage maxFreq and the min norm from the cache
+ return weight;
+ }
+
+ @Override
+ public Explanation explain(Explanation freq, long encodedNorm) throws IOException {
+ List<Explanation> subs = new ArrayList<>(explainConstantFactors());
+ Explanation tfExpl = explainTF(freq, encodedNorm);
+ subs.add(tfExpl);
+ return Explanation.match(weight * tfExpl.getValue().floatValue(),
+ "score(freq="+freq.getValue()+"), product of:", subs);
+ }
+
+ private Explanation explainTF(Explanation freq, long norm) throws IOException {
+ List<Explanation> subs = new ArrayList<>();
+ subs.add(freq);
+ subs.add(Explanation.match(k1, "k1, term saturation parameter"));
+ float doclen = LENGTH_TABLE[((byte) norm) & 0xff];
+ subs.add(Explanation.match(b, "b, length normalization parameter"));
+ if ((norm & 0xFF) > 39) {
+ subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
+ } else {
+ subs.add(Explanation.match(doclen, "dl, length of field"));
+ }
+ subs.add(Explanation.match(avgdl, "avgdl, average length of field"));
+ float normValue = k1 * ((1 - b) + b * doclen / avgdl);
+ return Explanation.match(
+ (float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)),
+ "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
+ }
+
+ private List<Explanation> explainConstantFactors() {
List<Explanation> subs = new ArrayList<>();
// scale factor
subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1"));
@@ -311,7 +266,6 @@ public class BM25Similarity extends Similarity {
}
}
-
@Override
public String toString() {
return "BM25(k1=" + k1 + ",b=" + b + ")";
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java
index cc3cab4..dc9356f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java
@@ -23,7 +23,7 @@ import org.apache.lucene.index.Terms;
* Stores all statistics commonly used ranking methods.
* @lucene.experimental
*/
-public class BasicStats extends Similarity.SimWeight {
+public class BasicStats {
final String field;
/** The number of documents. */
protected long numberOfDocuments;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
index 7134172..2690365 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@@ -19,7 +19,6 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -47,44 +46,36 @@ public class BooleanSimilarity extends Similarity {
}
@Override
- public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
- return new BooleanWeight(boost);
+ public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+ return new BooleanWeight(collectionStats.field(), boost);
}
- private static class BooleanWeight extends SimWeight {
+ private static class BooleanWeight extends SimScorer {
final float boost;
- BooleanWeight(float boost) {
+ BooleanWeight(String field, float boost) {
+ super(field);
this.boost = boost;
}
- }
-
- @Override
- public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
- final float boost = ((BooleanWeight) weight).boost;
-
- return new SimScorer() {
- @Override
- public float score(int doc, float freq) throws IOException {
- return boost;
- }
-
- @Override
- public float maxScore(float maxFreq) {
- return boost;
- }
+ @Override
+ public float score(float freq, long norm) throws IOException {
+ return boost;
+ }
- @Override
- public Explanation explain(int doc, Explanation freq) throws IOException {
- Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
- return Explanation.match(
- queryBoostExpl.getValue(),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
- queryBoostExpl);
- }
+ @Override
+ public float maxScore(float maxFreq) {
+ return boost;
+ }
- };
+ @Override
+ public Explanation explain(Explanation freq, long norm) throws IOException {
+ Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
+ return Explanation.match(
+ queryBoostExpl.getValue(),
+ "score(" + getClass().getSimpleName() + "), computed from:",
+ queryBoostExpl);
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java
index 66f22be..44da93c 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java
@@ -79,12 +79,12 @@ public class DFISimilarity extends SimilarityBase {
@Override
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
final double expected = (stats.getTotalTermFreq() + 1) * docLen /
(stats.getNumberOfFieldTokens() + 1);
if (freq.getValue().doubleValue() <= expected){
return Explanation.match((float) 0, "score(" +
- getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
+ getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), equals to 0");
}
Explanation explExpected = Explanation.match((float) expected,
@@ -103,7 +103,7 @@ public class DFISimilarity extends SimilarityBase {
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
+ "score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * log2(measure + 1) from:",
Explanation.match( (float)stats.getBoost(), "boost, query boost"),
explMeasure);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
index a41e35c..1677168 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
@@ -121,7 +121,7 @@ public class DFRSimilarity extends SimilarityBase {
@Override
protected void explain(List<Explanation> subs,
- BasicStats stats, int doc, double freq, double docLen) {
+ BasicStats stats, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost"));
}
@@ -136,13 +136,13 @@ public class DFRSimilarity extends SimilarityBase {
@Override
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
- explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
+ explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
+ "score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:",
subs);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
index 9a57608..231d554 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java
@@ -112,7 +112,7 @@ public class IBSimilarity extends SimilarityBase {
@Override
protected void explain(
- List<Explanation> subs, BasicStats stats, int doc, double freq, double docLen) {
+ List<Explanation> subs, BasicStats stats, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float)stats.getBoost(), "boost, query boost"));
}
@@ -125,13 +125,13 @@ public class IBSimilarity extends SimilarityBase {
@Override
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
- explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
+ explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
+ "score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"distribution.score(stats, normalization.tfn(stats, freq," +
" docLen), lambda.lambda(stats)) from:",
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
index c12cba4..7522c17 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java
@@ -84,7 +84,7 @@ public class LMDirichletSimilarity extends LMSimilarity {
}
@Override
- protected void explain(List<Explanation> subs, BasicStats stats, int doc,
+ protected void explain(List<Explanation> subs, BasicStats stats,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "query boost"));
@@ -107,18 +107,18 @@ public class LMDirichletSimilarity extends LMSimilarity {
(float)Math.log(mu / (docLen + mu)),
"document norm, computed as log(mu / (dl + mu))"));
subs.add(Explanation.match((float) docLen,"dl, length of field"));
- super.explain(subs, stats, doc, freq, docLen);
+ super.explain(subs, stats, freq, docLen);
}
@Override
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
- explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
+ explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
+ "score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"(term weight + document norm) from:",
subs);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
index 42e5a7b..dde0650 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java
@@ -74,7 +74,7 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
}
@Override
- protected void explain(List<Explanation> subs, BasicStats stats, int doc,
+ protected void explain(List<Explanation> subs, BasicStats stats,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
@@ -88,18 +88,18 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
"freq, number of occurrences of term in the document");
subs.add(explFreq);
subs.add(Explanation.match((float) docLen,"dl, length of field"));
- super.explain(subs, stats, doc, freq, docLen);
+ super.explain(subs, stats, freq, docLen);
}
@Override
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
- explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
+ explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
+ "score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"log(1 + ((1 - lambda) * freq / dl) /(lambda * P)) from:",
subs);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
index 8154806..73a1276 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
@@ -70,7 +70,7 @@ public abstract class LMSimilarity extends SimilarityBase {
}
@Override
- protected void explain(List<Explanation> subExpls, BasicStats stats, int doc,
+ protected void explain(List<Explanation> subExpls, BasicStats stats,
double freq, double docLen) {
subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats),
"collection probability"));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
index 2f48cc6..3526db4 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java
@@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -49,35 +48,27 @@ public class MultiSimilarity extends Similarity {
}
@Override
- public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
- SimWeight subStats[] = new SimWeight[sims.length];
- for (int i = 0; i < subStats.length; i++) {
- subStats[i] = sims[i].computeWeight(boost, collectionStats, termStats);
- }
- return new MultiStats(subStats);
- }
-
- @Override
- public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
+ public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
SimScorer subScorers[] = new SimScorer[sims.length];
for (int i = 0; i < subScorers.length; i++) {
- subScorers[i] = sims[i].simScorer(((MultiStats)stats).subStats[i], context);
+ subScorers[i] = sims[i].scorer(boost, collectionStats, termStats);
}
- return new MultiSimScorer(subScorers);
+ return new MultiSimScorer(collectionStats.field(), subScorers);
}
static class MultiSimScorer extends SimScorer {
private final SimScorer subScorers[];
- MultiSimScorer(SimScorer subScorers[]) {
+ MultiSimScorer(String field, SimScorer subScorers[]) {
+ super(field);
this.subScorers = subScorers;
}
@Override
- public float score(int doc, float freq) throws IOException {
+ public float score(float freq, long norm) throws IOException {
float sum = 0.0f;
for (SimScorer subScorer : subScorers) {
- sum += subScorer.score(doc, freq);
+ sum += subScorer.score(freq, norm);
}
return sum;
}
@@ -92,21 +83,13 @@ public class MultiSimilarity extends Similarity {
}
@Override
- public Explanation explain(int doc, Explanation freq) throws IOException {
+ public Explanation explain(Explanation freq, long norm) throws IOException {
List<Explanation> subs = new ArrayList<>();
for (SimScorer subScorer : subScorers) {
- subs.add(subScorer.explain(doc, freq));
+ subs.add(subScorer.explain(freq, norm));
}
- return Explanation.match(score(doc, freq.getValue().floatValue()), "sum of:", subs);
+ return Explanation.match(score(freq.getValue().floatValue(), norm), "sum of:", subs);
}
}
-
- static class MultiStats extends SimWeight {
- final SimWeight subStats[];
-
- MultiStats(SimWeight subStats[]) {
- this.subStats = subStats;
- }
- }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java b/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java
index 6c05616..ee2381f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java
@@ -17,9 +17,6 @@
package org.apache.lucene.search.similarities;
-import java.io.IOException;
-
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
@@ -46,26 +43,13 @@ public abstract class PerFieldSimilarityWrapper extends Similarity {
}
@Override
- public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
- PerFieldSimWeight weight = new PerFieldSimWeight();
- weight.delegate = get(collectionStats.field());
- weight.delegateWeight = weight.delegate.computeWeight(boost, collectionStats, termStats);
- return weight;
- }
-
- @Override
- public final SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
- PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
- return perFieldWeight.delegate.simScorer(perFieldWeight.delegateWeight, context);
+ public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+ return get(collectionStats.field()).scorer(boost, collectionStats, termStats);
}
/**
* Returns a {@link Similarity} for scoring a field.
*/
public abstract Similarity get(String name);
-
- static class PerFieldSimWeight extends SimWeight {
- Similarity delegate;
- SimWeight delegateWeight;
- }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
index 5f0bcd0..a2ebe4a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
@@ -19,16 +19,14 @@ package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.Collections;
+import java.util.Objects;
+import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.PhraseQuery;
-import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
-import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.SmallFloat;
/**
@@ -38,9 +36,9 @@ import org.apache.lucene.util.SmallFloat;
* <p>
* This is a low-level API, you should only extend this API if you want to implement
* an information retrieval <i>model</i>. If you are instead looking for a convenient way
- * to alter Lucene's scoring, consider extending a higher-level implementation
- * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
- * just tweaking the default implementation: {@link BM25Similarity}.
+ * to alter Lucene's scoring, consider just tweaking the default implementation:
+ * {@link BM25Similarity} or extending {@link SimilarityBase}, which makes it easy to compute
+ * a score from index statistics.
* <p>
* Similarity determines how Lucene weights terms, and Lucene interacts with
* this class at both <a href="#indextime">index-time</a> and
@@ -49,23 +47,22 @@ import org.apache.lucene.util.SmallFloat;
* <a name="indextime">Indexing Time</a>
* At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
* the Similarity implementation to set a per-document value for the field that will
- * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption
- * about what is in this norm, but it is most useful for encoding length normalization
- * information.
+ * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}.
+ * Lucene makes no assumption about what is in this norm, but it is most useful for
+ * encoding length normalization information.
* <p>
* Implementations should carefully consider how the normalization is encoded: while
- * Lucene's {@link BM25Similarity} encodes a combination of index-time boost
- * and length normalization information with {@link SmallFloat} into a single byte, this
- * might not be suitable for all purposes.
+ * Lucene's {@link BM25Similarity} encodes length normalization information with
+ * {@link SmallFloat} into a single byte, this might not be suitable for all purposes.
* <p>
* Many formulas require the use of average document length, which can be computed via a
* combination of {@link CollectionStatistics#sumTotalTermFreq()} and
- * {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()},
- * depending upon whether the average should reflect field sparsity.
+ * {@link CollectionStatistics#docCount()}.
* <p>
- * Additional scoring factors can be stored in named
- * <code>NumericDocValuesField</code>s and accessed
- * at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
+ * Additional scoring factors can be stored in named {@link NumericDocValuesField}s and
+ * accessed at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
+ * However, this should not be done in the {@link Similarity} but externally, for instance
+ * by using <tt>FunctionScoreQuery</tt>.
* <p>
* Finally, using index-time boosts (either via folding into the normalization byte or
* via DocValues), is an inefficient way to boost the scores of different fields if the
@@ -76,14 +73,13 @@ import org.apache.lucene.util.SmallFloat;
* <a name="querytime">Query time</a>
* At query-time, Queries interact with the Similarity via these steps:
* <ol>
- * <li>The {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} method is called a single time,
+ * <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a single time,
* allowing the implementation to compute any statistics (such as IDF, average document length, etc)
* across <i>the entire collection</i>. The {@link TermStatistics} and {@link CollectionStatistics} passed in
* already contain all of the raw statistics involved, so a Similarity can freely use any combination
* of statistics without causing any additional I/O. Lucene makes no assumption about what is
- * stored in the returned {@link Similarity.SimWeight} object.
- * <li>For each segment in the index, the Query creates a {@link #simScorer(SimWeight, org.apache.lucene.index.LeafReaderContext)}
- * The score() method is called for each matching document.
+ * stored in the returned {@link Similarity.SimScorer} object.
+ * <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute its score.
* </ol>
* <p>
* <a name="explaintime">Explanations</a>
@@ -126,37 +122,38 @@ public abstract class Similarity {
* @param termStats term-level statistics, such as the document frequency of a term across the collection.
* @return SimWeight object with the information this Similarity needs to score a query.
*/
- public abstract SimWeight computeWeight(float boost,
+ public abstract SimScorer scorer(float boost,
CollectionStatistics collectionStats, TermStatistics... termStats);
-
- /**
- * Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index.
- * @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
- * @param context segment of the inverted index to be scored.
- * @return SloppySimScorer for scoring documents across <code>context</code>
- * @throws IOException if there is a low-level I/O error
- */
- public abstract SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException;
- /**
- * API for scoring "sloppy" queries such as {@link TermQuery},
- * {@link SpanQuery}, and {@link PhraseQuery}.
+ /** Stores the weight for a query across the indexed collection and scores
+ * matching documents from a term frequency and an encoded norm. Descendants
+ * of {@code Similarity} should subclass {@code SimScorer} and define the
+ * statistics they require in the subclass. Examples include idf, average field length, etc.
*/
public static abstract class SimScorer {
-
+
+ private final String field;
+
/**
* Sole constructor. (For invocation by subclass
- * constructors, typically implicit.)
+ * constructors.)
*/
- public SimScorer() {}
+ public SimScorer(String field) {
+ this.field = Objects.requireNonNull(field);
+ }
+
+ /** Return the field that this {@link SimScorer} operates on. */
+ public final String getField() {
+ return field;
+ }
/**
* Score a single document
- * @param doc document id within the inverted index segment
* @param freq sloppy term frequency
+ * @param norm encoded normalization factor, as returned by {@link Similarity#computeNorm}, or {@code 1} if norms are disabled
* @return document's score
*/
- public abstract float score(int doc, float freq) throws IOException;
+ public abstract float score(float freq, long norm) throws IOException;
/**
* Return the maximum score that this scorer may produce for freqs in {@code ]0, maxFreq]}.
@@ -167,30 +164,16 @@ public abstract class Similarity {
/**
* Explain the score for a single document
- * @param doc document id within the inverted index segment
* @param freq Explanation of how the sloppy term frequency was computed
+ * @param norm encoded normalization factor, as returned by {@link Similarity#computeNorm}, or {@code 1} if norms are disabled
* @return document's score
*/
- public Explanation explain(int doc, Explanation freq) throws IOException {
+ public Explanation explain(Explanation freq, long norm) throws IOException {
return Explanation.match(
- score(doc, freq.getValue().floatValue()),
- "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:",
+ score(freq.getValue().floatValue(), norm),
+ "score(freq=" + freq.getValue() +"), with freq of:",
Collections.singleton(freq));
}
- }
-
- /** Stores the weight for a query across the indexed collection. This abstract
- * implementation is empty; descendants of {@code Similarity} should
- * subclass {@code SimWeight} and define the statistics they require in the
- * subclass. Examples include idf, average field length, etc.
- */
- public static abstract class SimWeight {
-
- /**
- * Sole constructor. (For invocation by subclass
- * constructors, typically implicit.)
- */
- public SimWeight() {}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fd7ead9/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
index f227f38..f750b12 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@@ -22,8 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -33,7 +31,7 @@ import org.apache.lucene.util.SmallFloat;
* A subclass of {@code Similarity} that provides a simplified API for its
* descendants. Subclasses are only required to implement the {@link #score}
* and {@link #toString()} methods. Implementing
- * {@link #explain(List, BasicStats, int, double, double)} is optional,
+ * {@link #explain(List, BasicStats, double, double)} is optional,
* inasmuch as SimilarityBase already provides a basic explanation of the score
* and the term frequency. However, implementers of a subclass are encouraged to
* include as much detail about the scoring method as possible.
@@ -82,13 +80,18 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
- public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
- BasicStats stats[] = new BasicStats[termStats.length];
+ public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+ SimScorer weights[] = new SimScorer[termStats.length];
for (int i = 0; i < termStats.length; i++) {
- stats[i] = newStats(collectionStats.field(), boost);
- fillBasicStats(stats[i], collectionStats, termStats[i]);
+ BasicStats stats = newStats(collectionStats.field(), boost);
+ fillBasicStats(stats, collectionStats, termStats[i]);
+ weights[i] = new BasicSimScorer(stats);
+ }
+ if (weights.length == 1) {
+ return weights[0];
+ } else {
+ return new MultiSimilarity.MultiSimScorer(collectionStats.field(), weights);
}
- return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
}
/** Factory method to return a custom stats object */
@@ -137,12 +140,11 @@ public abstract class SimilarityBase extends Similarity {
*
* @param subExpls the list of details of the explanation to extend
* @param stats the corpus level statistics.
- * @param doc the document id.
* @param freq the term frequency.
* @param docLen the document length.
*/
protected void explain(
- List<Explanation> subExpls, BasicStats stats, int doc, double freq, double docLen) {}
+ List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {}
/**
* Explains the score. The implementation here provides a basic explanation
@@ -151,43 +153,24 @@ public abstract class SimilarityBase extends Similarity {
* attaches the score (computed via the {@link #score(BasicStats, double, double)}
* method) and the explanation for the term frequency. Subclasses content with
* this format may add additional details in
- * {@link #explain(List, BasicStats, int, double, double)}.
+ * {@link #explain(List, BasicStats, double, double)}.
*
* @param stats the corpus level statistics.
- * @param doc the document id.
* @param freq the term frequency and its explanation.
* @param docLen the document length.
* @return the explanation.
*/
protected Explanation explain(
- BasicStats stats, int doc, Explanation freq, double docLen) {
+ BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
- explain(subs, stats, doc, freq.getValue().floatValue(), docLen);
+ explain(subs, stats, freq.getValue().floatValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().floatValue(), docLen),
- "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
+ "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:",
subs);
}
- @Override
- public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
- if (stats instanceof MultiSimilarity.MultiStats) {
- // a multi term query (e.g. phrase). return the summation,
- // scoring almost as if it were boolean query
- SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
- SimScorer subScorers[] = new SimScorer[subStats.length];
- for (int i = 0; i < subScorers.length; i++) {
- BasicStats basicstats = (BasicStats) subStats[i];
- subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
- }
- return new MultiSimilarity.MultiSimScorer(subScorers);
- } else {
- BasicStats basicstats = (BasicStats) stats;
- return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
- }
- }
-
/**
* Subclasses must override this method to return the name of the Similarity
* and preferably the values of parameters (if any) as well.
@@ -227,33 +210,27 @@ public abstract class SimilarityBase extends Similarity {
// --------------------------------- Classes ---------------------------------
- /** Delegates the {@link #score(int, float)} and
- * {@link #explain(int, Explanation)} methods to
+ /** Delegates the {@link #score(float, long)} and
+ * {@link #explain(Explanation, long)} methods to
* {@link SimilarityBase#score(BasicStats, double, double)} and
- * {@link SimilarityBase#explain(BasicStats, int, Explanation, double)},
+ * {@link SimilarityBase#explain(BasicStats, Explanation, double)},
* respectively.
*/
final class BasicSimScorer extends SimScorer {
- private final BasicStats stats;
- private final NumericDocValues norms;
+ final BasicStats stats;
- BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
+ BasicSimScorer(BasicStats stats) {
+ super(stats.field);
this.stats = stats;
- this.norms = norms;
}
- double getLengthValue(int doc) throws IOException {
- if (norms == null) {
- return 1D;
- }
- boolean found = norms.advanceExact(doc);
- assert found;
- return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())];
+ double getLengthValue(long norm) throws IOException {
+ return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
}
@Override
- public float score(int doc, float freq) throws IOException {
- return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc));
+ public float score(float freq, long norm) throws IOException {
+ return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm));
}
@Override
@@ -262,8 +239,8 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
- public Explanation explain(int doc, Explanation freq) throws IOException {
- return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
+ public Explanation explain(Explanation freq, long norm) throws IOException {
+ return SimilarityBase.this.explain(stats, freq, getLengthValue(norm));
}
}