You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2018/03/20 10:57:44 UTC
[1/2] lucene-solr:branch_7x: LUCENE-8197: Efficient integration of
static scoring factors.
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x ddb0bd78f -> 70cc7b687
refs/heads/master 2e35ef2b3 -> 710993435
LUCENE-8197: Efficient integration of static scoring factors.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/70cc7b68
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/70cc7b68
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/70cc7b68
Branch: refs/heads/branch_7x
Commit: 70cc7b68783b313a845996dac9e28c3ea6ad61e3
Parents: ddb0bd7
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Mar 20 11:21:39 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Mar 20 11:57:21 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../apache/lucene/document/FeatureField.java | 527 +++++++++++++++++++
.../apache/lucene/document/FeatureQuery.java | 145 +++++
.../lucene/document/TestFeatureField.java | 316 +++++++++++
4 files changed, 991 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/70cc7b68/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c4d7e25..47459ec 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -13,6 +13,9 @@ New Features
deleted documents around for later reuse. See "IW.softUpdateDocument(...)"
for reference. (Simon Willnauer)
+* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
+ static relevance signals into the final score. (Adrien Grand, Robert Muir)
+
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/70cc7b68/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
new file mode 100644
index 0000000..10a7310
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * {@link Field} that can be used to store static scoring factors into
+ * documents. This is mostly inspired from the work from Nick Craswell,
+ * Stephen Robertson, Hugo Zaragoza and Michael Taylor. Relevance weighting
+ * for query independent evidence. Proceedings of the 28th annual international
+ * ACM SIGIR conference on Research and development in information retrieval.
+ * August 15-19, 2005, Salvador, Brazil.
+ * <p>
+ * Feature values are internally encoded as term frequencies. Putting
+ * feature queries as
+ * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses of a
+ * {@link BooleanQuery} allows to combine query-dependent scores (eg. BM25)
+ * with query-independent scores using a linear combination. The fact that
+ * feature values are stored as frequencies will allow search logic to
+ * efficiently skip documents that can't be competitive when total hit counts
+ * are not requested in the future. This makes it a compelling option compared
+ * to storing such factors eg. in a doc-value field.
+ * <p>
+ * This field may only store factors that are positively correlated with the
+ * final score, like pagerank. In case of factors that are inversely correlated
+ * with the score like url length, the inverse of the scoring factor should be
+ * stored, ie. {@code 1/urlLength}.
+ * <p>
+ * This field only considers the top 9 significant bits for storage efficiency
+ * which allows to store them on 16 bits internally. In practice this limitation
+ * means that values are stored with a relative precision of
+ * 2<sup>-8</sup> = 0.00390625.
+ * <p>
+ * Given a scoring factor {@code S > 0} and its weight {@code w > 0}, there
+ * are three ways that S can be turned into a score:
+ * <ul>
+ * <li>{@link #newLogQuery w * log(a + S)}, with a ≥ 1. This function
+ * usually makes sense because the distribution of scoring factors
+ * often follows a power law. This is typically the case for pagerank for
+ * instance. However the paper suggested that the {@code satu} and
+ * {@code sigm} functions give even better results.
+ * <li>{@link #newSaturationQuery satu(S) = w * S / (S + k)}, with k > 0. This
+ * function is similar to the one used by {@link BM25Similarity} in order
+ * to incorporate term frequency into the final score and produces values
+ * between 0 and 1. A value of 0.5 is obtained when S and k are equal.
+ * <li>{@link #newSigmoidQuery sigm(S) = w * S<sup>a</sup> / (S<sup>a</sup> + k<sup>a</sup>)},
+ * with k > 0, a > 0. This function provided even better results
+ * than the two above but is also harder to tune due to the fact it has
+ * 2 parameters. Like with {@code satu}, values are in the 0..1 range and
+ * 0.5 is obtained when S and k are equal.
+ * </ul>
+ * <p>
+ * The constants in the above formulas typically need training in order to
+ * compute optimal values. If you don't know where to start, the
+ * {@link #newSaturationQuery(IndexSearcher, String, String)} method uses
+ * {@code 1f} as a weight and tries to guess a sensible value for the
+ * {@code pivot} parameter of the saturation function based on index
+ * statistics, which shouldn't perform too badly. Here is an example, assuming
+ * that documents have a {@link FeatureField} called 'features' with values for
+ * the 'pagerank' feature.
+ * <pre class="prettyprint">
+ * Query query = new BooleanQuery.Builder()
+ * .add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
+ * .add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
+ * .build();
+ * Query boost = FeatureField.newSaturationQuery(searcher, "features", "pagerank");
+ * Query boostedQuery = new BooleanQuery.Builder()
+ * .add(query, Occur.MUST)
+ * .add(boost, Occur.SHOULD)
+ * .build();
+ * TopDocs topDocs = searcher.search(boostedQuery, 10);
+ * </pre>
+ * @lucene.experimental
+ */
+public final class FeatureField extends Field {
+
+  /**
+   * Index configuration shared by all feature fields: not tokenized, norms
+   * omitted, and documents indexed together with term frequencies.
+   */
+  private static final FieldType FIELD_TYPE = buildFieldType();
+
+  /** Creates and configures the {@link FieldType} used by every {@link FeatureField}. */
+  private static FieldType buildFieldType() {
+    final FieldType type = new FieldType();
+    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+    type.setOmitNorms(true);
+    type.setTokenized(false);
+    return type;
+  }
+
+ // Value as supplied by the caller; tokenStream() is where it gets encoded as a term frequency.
+ private float featureValue;
+
+ /**
+ * Create a feature.
+ * @param fieldName The name of the field to store the information into. All features may be stored in the same field.
+ * @param featureName The name of the feature, eg. 'pagerank'. It will be indexed as a term.
+ * @param featureValue The value of the feature, must be a positive, finite, normal float.
+ * @throws IllegalArgumentException if {@code featureValue} is not finite or is less than {@link Float#MIN_NORMAL}
+ */
+ public FeatureField(String fieldName, String featureName, float featureValue) {
+ super(fieldName, featureName, FIELD_TYPE);
+ // Goes through the setter so that the value is validated.
+ setFeatureValue(featureValue);
+ }
+
+  /**
+   * Update the feature value of this field.
+   * @param featureValue the new value, must be finite and at least {@link Float#MIN_NORMAL}
+   * @throws IllegalArgumentException if {@code featureValue} is NaN or infinite, or if it
+   *         is less than {@link Float#MIN_NORMAL} (zero, negative and subnormal values)
+   */
+  public void setFeatureValue(float featureValue) {
+    if (Float.isFinite(featureValue) == false) {
+      throw new IllegalArgumentException("featureValue must be finite, got: " + featureValue +
+          " for feature " + fieldsData + " on field " + name);
+    }
+    if (featureValue < Float.MIN_NORMAL) {
+      // Note the leading space in " for feature": without it the value and the
+      // following text run together in the message.
+      throw new IllegalArgumentException("featureValue must be a positive normal float, got: " +
+          featureValue + " for feature " + fieldsData + " on field " + name +
+          " which is less than the minimum positive normal float: " + Float.MIN_NORMAL);
+    }
+    this.featureValue = featureValue;
+  }
+
+  /**
+   * Returns a stream that emits the feature name as its only token, with the
+   * encoded feature value as the term frequency. The {@code analyzer} is not used.
+   */
+  @Override
+  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+    // Recycle the stream handed in by the caller when it is one of ours.
+    final FeatureTokenStream tokens = reuse instanceof FeatureTokenStream
+        ? (FeatureTokenStream) reuse
+        : new FeatureTokenStream();
+
+    // The frequency is the float's bit pattern without its 15 low-order bits.
+    final int encodedFreq = Float.floatToIntBits(featureValue) >>> 15;
+    tokens.setValues((String) fieldsData, encodedFreq);
+    return tokens;
+  }
+
+  /**
+   * Token stream that produces exactly one token per {@link #reset()}: the
+   * term and term frequency most recently passed to {@link #setValues}.
+   */
+  private static final class FeatureTokenStream extends TokenStream {
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final TermFrequencyAttribute termFreqAtt = addAttribute(TermFrequencyAttribute.class);
+    /** Term text of the single token. */
+    private String term = null;
+    /** Term frequency of the single token. */
+    private int termFreq = 0;
+    /** Whether the single token has been consumed; true until {@link #reset()} is called. */
+    private boolean exhausted = true;
+
+    private FeatureTokenStream() {
+    }
+
+    /** Sets the term and the frequency that the next token will carry. */
+    void setValues(String value, int freq) {
+      this.term = value;
+      this.termFreq = freq;
+    }
+
+    @Override
+    public boolean incrementToken() {
+      if (exhausted == false) {
+        clearAttributes();
+        termAtt.append(term);
+        termFreqAtt.setTermFrequency(termFreq);
+        exhausted = true;
+        return true;
+      }
+      return false;
+    }
+
+    @Override
+    public void reset() {
+      // Makes the single token available again.
+      exhausted = false;
+    }
+
+    @Override
+    public void close() {
+      // Drop the reference to the term text.
+      term = null;
+    }
+  }
+
+  /** Term frequency that {@link Float#MAX_VALUE} is encoded to: its int bits shifted right by 15. */
+  private static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
+
+  /**
+   * Converts a term frequency back into a feature value by shifting it left by
+   * 15 bits and reinterpreting the result as the bits of a float.
+   * @param freq the term frequency; anything above {@link #MAX_FREQ} decodes to {@link Float#MAX_VALUE}
+   */
+  private static float decodeFeatureValue(float freq) {
+    // Frequencies above MAX_FREQ are never indexed, but callers of the
+    // SimScorer API might pass eg. Float.MAX_VALUE to compute a max score, so
+    // such inputs are mapped to the largest feature value to stay consistent.
+    final boolean aboveEncodedRange = freq > MAX_FREQ;
+    return aboveEncodedRange
+        ? Float.MAX_VALUE
+        : Float.intBitsToFloat(((int) freq) << 15);
+  }
+
+  /** Base class of the functions that turn a feature's term frequency into a score. */
+  abstract static class FeatureFunction {
+    /** Returns the {@link SimScorer} that scores the given field with weight {@code w}. */
+    abstract SimScorer scorer(String field, float w);
+
+    /** Explains the score given to {@code doc} for a feature indexed with term frequency {@code freq}. */
+    abstract Explanation explain(String field, String feature, float w, int doc, int freq) throws IOException;
+  }
+
+  /** Scores a feature value S as {@code w * log(a + S)}, where {@code a} is the scaling factor. */
+  static final class LogFunction extends FeatureFunction {
+
+    private final float scalingFactor;
+
+    LogFunction(float a) {
+      scalingFactor = a;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (obj != null && getClass() == obj.getClass()) {
+        return scalingFactor == ((LogFunction) obj).scalingFactor;
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return Float.hashCode(scalingFactor);
+    }
+
+    @Override
+    public String toString() {
+      return "LogFunction(scalingFactor=" + scalingFactor + ")";
+    }
+
+    @Override
+    SimScorer scorer(String field, float weight) {
+      return new SimScorer() {
+        @Override
+        public float score(int doc, float freq) {
+          return (float) (weight * Math.log(scalingFactor + decodeFeatureValue(freq)));
+        }
+
+        // Slop and payload factors are not supported by feature scorers.
+
+        @Override
+        public float computeSlopFactor(int distance) {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+
+    @Override
+    Explanation explain(String field, String feature, float w, int doc, int freq) throws IOException {
+      // The explained score comes from the very scorer that is used at search time.
+      final float explainedScore = scorer(field, w).score(doc, freq);
+      final String description =
+          "Log function on the " + field + " field for the " + feature + " feature, computed as w * log(a + S) from:";
+      return Explanation.match(explainedScore, description,
+          Explanation.match(w, "w, weight of this function"),
+          Explanation.match(scalingFactor, "a, scaling factor"),
+          Explanation.match(decodeFeatureValue(freq), "S, feature value"));
+    }
+  }
+
+  /** Scores a feature value S as {@code w * S / (S + k)}, where {@code k} is the pivot. */
+  static final class SaturationFunction extends FeatureFunction {
+
+    private final float pivot;
+
+    SaturationFunction(float pivot) {
+      this.pivot = pivot;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (obj != null && getClass() == obj.getClass()) {
+        return pivot == ((SaturationFunction) obj).pivot;
+      }
+      return false;
+    }
+
+    @Override
+    public int hashCode() {
+      return Float.hashCode(pivot);
+    }
+
+    @Override
+    public String toString() {
+      return "SaturationFunction(pivot=" + pivot + ")";
+    }
+
+    @Override
+    SimScorer scorer(String field, float weight) {
+      return new SimScorer() {
+        @Override
+        public float score(int doc, float freq) {
+          float f = decodeFeatureValue(freq);
+          // Mathematically this is f / (f + k); it is evaluated as
+          // 1 - k / (f + k) so that rounding cannot make the score
+          // decrease when f increases.
+          return weight * (1 - pivot / (f + pivot));
+        }
+
+        // Slop and payload factors are not supported by feature scorers.
+
+        @Override
+        public float computeSlopFactor(int distance) {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+
+    @Override
+    Explanation explain(String field, String feature, float weight, int doc, int freq) throws IOException {
+      // The explained score comes from the very scorer that is used at search time.
+      final float explainedScore = scorer(field, weight).score(doc, freq);
+      final String description =
+          "Saturation function on the " + field + " field for the " + feature + " feature, computed as w * S / (S + k) from:";
+      return Explanation.match(explainedScore, description,
+          Explanation.match(weight, "w, weight of this function"),
+          Explanation.match(pivot, "k, pivot feature value that would give a score contribution equal to w/2"),
+          Explanation.match(decodeFeatureValue(freq), "S, feature value"));
+    }
+  }
+
+  /**
+   * Scores a feature value S as {@code w * S^a / (S^a + k^a)}, where {@code k}
+   * is the pivot and {@code a} the exponent.
+   */
+  static final class SigmoidFunction extends FeatureFunction {
+
+    private final float pivot, a;
+    /** {@code pivot^a}, computed once in the constructor and reused for every score. */
+    private final double pivotPa;
+
+    SigmoidFunction(float pivot, float a) {
+      this.pivot = pivot;
+      this.a = a;
+      this.pivotPa = Math.pow(pivot, a);
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (obj == null || getClass() != obj.getClass()) {
+        return false;
+      }
+      SigmoidFunction that = (SigmoidFunction) obj;
+      return pivot == that.pivot
+          && a == that.a;
+    }
+
+    @Override
+    public int hashCode() {
+      int h = Float.hashCode(pivot);
+      h = 31 * h + Float.hashCode(a);
+      return h;
+    }
+
+    @Override
+    public String toString() {
+      return "SigmoidFunction(pivot=" + pivot + ", a=" + a + ")";
+    }
+
+    @Override
+    SimScorer scorer(String field, float weight) {
+      return new SimScorer() {
+        @Override
+        public float score(int doc, float freq) {
+          float f = decodeFeatureValue(freq);
+          // should be f^a / (f^a + k^a) but we rewrite it to
+          // 1 - k^a / (f^a + k^a) to make sure it doesn't decrease
+          // with f in spite of rounding
+          return (float) (weight * (1 - pivotPa / (Math.pow(f, a) + pivotPa)));
+        }
+
+        @Override
+        public float computeSlopFactor(int distance) {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+
+    @Override
+    Explanation explain(String field, String feature, float weight, int doc, int freq) throws IOException {
+      float featureValue = decodeFeatureValue(freq);
+      float score = scorer(field, weight).score(doc, freq);
+      return Explanation.match(score,
+          "Sigmoid function on the " + field + " field for the " + feature + " feature, computed as w * S^a / (S^a + k^a) from:",
+          Explanation.match(weight, "w, weight of this function"),
+          Explanation.match(pivot, "k, pivot feature value that would give a score contribution equal to w/2"),
+          // The exponent detail must report a, not the pivot.
+          Explanation.match(a, "a, exponent, higher values make the function grow slower before k and faster after k"),
+          Explanation.match(featureValue, "S, feature value"));
+    }
+  }
+
+ /**
+ * Given that IDFs are logs, similarities that incorporate term freq and
+ * document length in sane (ie. saturated) ways should have their score
+ * bounded by a log. So we reject weights that are too high as it would mean
+ * that this clause would completely dominate ranking, removing the need for
+ * query-dependent scores.
+ * <p>
+ * The bound is {@link Long#SIZE}, ie. 64: it is the inclusive upper limit of
+ * the {@code weight} argument of the query factory methods of this class.
+ */
+ private static final float MAX_WEIGHT = Long.SIZE;
+
+  /**
+   * Return a new {@link Query} that will score documents as
+   * {@code weight * Math.log(scalingFactor + S)} where S is the value of the static feature.
+   * @param fieldName     field that stores features
+   * @param featureName   name of the feature
+   * @param weight        weight to give to this feature, must be in (0,64]
+   * @param scalingFactor scaling factor applied before taking the logarithm, must be in [1, +Infinity)
+   * @return the feature query, wrapped in a {@link BoostQuery} unless {@code weight} is 1
+   * @throws IllegalArgumentException if weight is not in (0,64] (NaN included) or scalingFactor is not in [1, +Infinity)
+   */
+  public static Query newLogQuery(String fieldName, String featureName, float weight, float scalingFactor) {
+    // Expressed as a negated range check: every comparison against NaN is
+    // false, so "weight <= 0 || weight > MAX_WEIGHT" would let NaN through.
+    if ((weight > 0 && weight <= MAX_WEIGHT) == false) {
+      throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+    }
+    if (scalingFactor < 1 || Float.isFinite(scalingFactor) == false) {
+      throw new IllegalArgumentException("scalingFactor must be >= 1, got: " + scalingFactor);
+    }
+    Query q = new FeatureQuery(fieldName, featureName, new LogFunction(scalingFactor));
+    if (weight != 1f) {
+      q = new BoostQuery(q, weight);
+    }
+    return q;
+  }
+
+  /**
+   * Return a new {@link Query} that will score documents as
+   * {@code weight * S / (S + pivot)} where S is the value of the static feature.
+   * @param fieldName   field that stores features
+   * @param featureName name of the feature
+   * @param weight      weight to give to this feature, must be in (0,64]
+   * @param pivot       feature value that would give a score contribution equal to weight/2, must be in (0, +Infinity)
+   * @return the feature query, wrapped in a {@link BoostQuery} unless {@code weight} is 1
+   * @throws IllegalArgumentException if weight is not in (0,64] (NaN included) or pivot is not in (0, +Infinity)
+   */
+  public static Query newSaturationQuery(String fieldName, String featureName, float weight, float pivot) {
+    // Expressed as a negated range check: every comparison against NaN is
+    // false, so "weight <= 0 || weight > MAX_WEIGHT" would let NaN through.
+    if ((weight > 0 && weight <= MAX_WEIGHT) == false) {
+      throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+    }
+    if (pivot <= 0 || Float.isFinite(pivot) == false) {
+      throw new IllegalArgumentException("pivot must be > 0, got: " + pivot);
+    }
+    Query q = new FeatureQuery(fieldName, featureName, new SaturationFunction(pivot));
+    if (weight != 1f) {
+      q = new BoostQuery(q, weight);
+    }
+    return q;
+  }
+
+  /**
+   * Variant of {@link #newSaturationQuery(String, String, float, float)} for
+   * callers that have no tuned parameters: the weight is {@code 1f} and the
+   * pivot is whatever
+   * {@link #computePivotFeatureValue(IndexSearcher, String, String)} returns
+   * for the given searcher. This isn't expected to be an optimal
+   * configuration but should be a good starting point.
+   * @param searcher         the {@link IndexSearcher} that you will search against
+   * @param featureFieldName the field that stores features
+   * @param featureName      the name of the feature
+   */
+  public static Query newSaturationQuery(IndexSearcher searcher,
+      String featureFieldName, String featureName) throws IOException {
+    final float guessedPivot = computePivotFeatureValue(searcher, featureFieldName, featureName);
+    final float neutralWeight = 1f;
+    return newSaturationQuery(featureFieldName, featureName, neutralWeight, guessedPivot);
+  }
+
+  /**
+   * Return a new {@link Query} that will score documents as
+   * {@code weight * S^a / (S^a + pivot^a)} where S is the value of the static feature
+   * and {@code a} is the {@code exp} argument.
+   * @param fieldName   field that stores features
+   * @param featureName name of the feature
+   * @param weight      weight to give to this feature, must be in (0,64]
+   * @param pivot       feature value that would give a score contribution equal to weight/2, must be in (0, +Infinity)
+   * @param exp         exponent, higher values make the function grow slower before 'pivot' and faster after 'pivot', must be in (0, +Infinity)
+   * @return the feature query, wrapped in a {@link BoostQuery} unless {@code weight} is 1
+   * @throws IllegalArgumentException if weight is not in (0,64] (NaN included) or either pivot or exp are not in (0, +Infinity)
+   */
+  public static Query newSigmoidQuery(String fieldName, String featureName, float weight, float pivot, float exp) {
+    // Expressed as a negated range check: every comparison against NaN is
+    // false, so "weight <= 0 || weight > MAX_WEIGHT" would let NaN through.
+    if ((weight > 0 && weight <= MAX_WEIGHT) == false) {
+      throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+    }
+    if (pivot <= 0 || Float.isFinite(pivot) == false) {
+      throw new IllegalArgumentException("pivot must be > 0, got: " + pivot);
+    }
+    if (exp <= 0 || Float.isFinite(exp) == false) {
+      throw new IllegalArgumentException("exp must be > 0, got: " + exp);
+    }
+    Query q = new FeatureQuery(fieldName, featureName, new SigmoidFunction(pivot, exp));
+    if (weight != 1f) {
+      q = new BoostQuery(q, weight);
+    }
+    return q;
+  }
+
+  /**
+   * Compute a feature value that may be used as the {@code pivot} parameter of
+   * the {@link #newSaturationQuery(String, String, float, float)} and
+   * {@link #newSigmoidQuery(String, String, float, float, float)} factory
+   * methods. The implementation takes the average of the int bits of the float
+   * representation in practice before converting it back to a float. Given that
+   * floats store the exponent in the higher bits, it means that the result will
+   * be an approximation of the geometric mean of all feature values.
+   * @param searcher     the {@link IndexSearcher} to search against
+   * @param featureField the field that stores features
+   * @param featureName  the name of the feature
+   * @return the pivot candidate, or {@code 1} when no document of the index has this feature
+   */
+  public static float computePivotFeatureValue(IndexSearcher searcher, String featureField, String featureName) throws IOException {
+    Term term = new Term(featureField, featureName);
+    TermContext context = TermContext.build(searcher.getIndexReader().getContext(), term);
+    if (context.docFreq() == 0) {
+      // Avoid the division by zero: its NaN result would decode to 0, which
+      // the factory methods reject as a pivot. The exact value is arbitrary,
+      // it only needs to be a legal pivot.
+      return 1f;
+    }
+    float avgFreq = (float) ((double) context.totalTermFreq() / context.docFreq());
+    return decodeFeatureValue(avgFreq);
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/70cc7b68/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java
new file mode 100644
index 0000000..eb71d05
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.lucene.document.FeatureField.FeatureFunction;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Query that matches the documents containing the term {@code featureName}
+ * in the field {@code fieldName}, and scores them by applying a
+ * {@link FeatureFunction} to the frequency of that term.
+ */
+final class FeatureQuery extends Query {
+
+  private final String fieldName;
+  private final String featureName;
+  private final FeatureFunction function;
+
+  /** Creates the query; none of the arguments may be null. */
+  FeatureQuery(String fieldName, String featureName, FeatureFunction function) {
+    this.fieldName = Objects.requireNonNull(fieldName);
+    this.featureName = Objects.requireNonNull(featureName);
+    this.function = Objects.requireNonNull(function);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj != null && getClass() == obj.getClass()) {
+      final FeatureQuery other = (FeatureQuery) obj;
+      return Objects.equals(fieldName, other.fieldName)
+          && Objects.equals(featureName, other.featureName)
+          && Objects.equals(function, other.function);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = getClass().hashCode();
+    hash = 31 * hash + fieldName.hashCode();
+    hash = 31 * hash + featureName.hashCode();
+    return 31 * hash + function.hashCode();
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
+    return new Weight(this) {
+
+      @Override
+      public boolean isCacheable(LeafReaderContext ctx) {
+        return false;
+      }
+
+      @Override
+      public void extractTerms(Set<Term> terms) {}
+
+      @Override
+      public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+        final String desc = "weight(" + getQuery() + " in " + doc + ") [" + function + "]";
+
+        // Each lookup step that fails yields its own "no match" explanation.
+        final Terms fieldTerms = context.reader().terms(fieldName);
+        if (fieldTerms == null) {
+          return Explanation.noMatch(desc + ". Field " + fieldName + " doesn't exist.");
+        }
+        final TermsEnum featureTerms = fieldTerms.iterator();
+        if (featureTerms.seekExact(new BytesRef(featureName)) == false) {
+          return Explanation.noMatch(desc + ". Feature " + featureName + " doesn't exist.");
+        }
+
+        final PostingsEnum featurePostings = featureTerms.postings(null, PostingsEnum.FREQS);
+        if (featurePostings.advance(doc) != doc) {
+          return Explanation.noMatch(desc + ". Feature " + featureName + " isn't set.");
+        }
+
+        return function.explain(fieldName, featureName, boost, doc, featurePostings.freq());
+      }
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        // A null scorer is returned when the field or the feature is absent from this segment.
+        final Terms fieldTerms = context.reader().terms(fieldName);
+        if (fieldTerms == null) {
+          return null;
+        }
+        final TermsEnum featureTerms = fieldTerms.iterator();
+        if (featureTerms.seekExact(new BytesRef(featureName)) == false) {
+          return null;
+        }
+
+        final SimScorer simScorer = function.scorer(fieldName, boost);
+        final PostingsEnum featurePostings = featureTerms.postings(null, PostingsEnum.FREQS);
+
+        // The postings of the feature term drive both iteration and scoring.
+        return new Scorer(this) {
+
+          @Override
+          public int docID() {
+            return featurePostings.docID();
+          }
+
+          @Override
+          public float score() throws IOException {
+            return simScorer.score(featurePostings.docID(), featurePostings.freq());
+          }
+
+          @Override
+          public DocIdSetIterator iterator() {
+            return featurePostings;
+          }
+
+        };
+      }
+
+    };
+  }
+
+  @Override
+  public String toString(String field) {
+    return "FeatureQuery(field=" + fieldName + ", feature=" + featureName + ", function=" + function + ")";
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/70cc7b68/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
new file mode 100644
index 0000000..c15c226
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestFeatureField extends LuceneTestCase {
+
+ /** Round a float value the same way that {@link FeatureField} rounds feature values. */
+ private static float round(float f) {
+ int bits = Float.floatToIntBits(f);
+ bits &= ~0 << 15; // clear last 15 bits
+ return Float.intBitsToFloat(bits);
+ }
+
+ public void testBasics() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
+ .setMergePolicy(newLogMergePolicy(random().nextBoolean())));
+ Document doc = new Document();
+ FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+ FeatureField urlLength = new FeatureField("features", "urlLen", 1);
+ doc.add(pagerank);
+ doc.add(urlLength);
+
+ pagerank.setFeatureValue(10);
+ urlLength.setFeatureValue(1f / 24);
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(100);
+ urlLength.setFeatureValue(1f / 20);
+ writer.addDocument(doc);
+
+ writer.addDocument(new Document()); // gap
+
+ pagerank.setFeatureValue(1);
+ urlLength.setFeatureValue(1f / 100);
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(42);
+ urlLength.setFeatureValue(1f / 23);
+ writer.addDocument(doc);
+
+ writer.forceMerge(1);
+ DirectoryReader reader = writer.getReader();
+ writer.close();
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+ LeafReaderContext context = reader.leaves().get(0);
+
+ Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
+ Weight w = q.createWeight(searcher, true, 2);
+ Scorer s = w.scorer(context);
+
+ assertEquals(0, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * Math.log(4.5f + 10)), s.score(), 0f);
+
+ assertEquals(1, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * Math.log(4.5f + 100)), s.score(), 0f);
+
+ assertEquals(3, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * Math.log(4.5f + 1)), s.score(), 0f);
+
+ assertEquals(4, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * Math.log(4.5f + 42)), s.score(), 0f);
+
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
+
+ q = FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f);
+ w = q.createWeight(searcher, true, 2);
+ s = w.scorer(context);
+
+ assertEquals(0, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 4.5f / (4.5f + 10)), s.score(), 0f);
+
+ assertEquals(1, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 4.5f / (4.5f + 100)), s.score(), 0f);
+
+ assertEquals(3, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 4.5f / (4.5f + 1)), s.score(), 0f);
+
+ assertEquals(4, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 4.5f / (4.5f + 42)), s.score(), 0f);
+
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
+
+ q = FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f);
+ w = q.createWeight(searcher, true, 2);
+ s = w.scorer(context);
+ double kPa = Math.pow(4.5f, 0.6f);
+
+ assertEquals(0, s.iterator().nextDoc());
+ assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(10, 0.6f)))), s.score(), 0f);
+
+ assertEquals(1, s.iterator().nextDoc());
+ assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(100, 0.6f)))), s.score(), 0f);
+
+ assertEquals(3, s.iterator().nextDoc());
+ assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(1, 0.6f)))), s.score(), 0f);
+
+ assertEquals(4, s.iterator().nextDoc());
+ assertEquals((float) (6 * (1 - kPa / (kPa + Math.pow(42, 0.6f)))), s.score(), 0f);
+
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
+
+ q = FeatureField.newSaturationQuery("features", "urlLen", 3f, 1f/24);
+ w = q.createWeight(searcher, true, 2);
+ s = w.scorer(context);
+
+ assertEquals(0, s.iterator().nextDoc());
+ assertEquals(6f * (1 - (1f/24) / (1f/24 + round(1f/24))), s.score(), 0f);
+
+ assertEquals(1, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 1f/24 / (1f/24 + round(1f/20))), s.score(), 0f);
+
+ assertEquals(3, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 1f/24 / (1f/24 + round(1f/100))), s.score(), 0f);
+
+ assertEquals(4, s.iterator().nextDoc());
+ assertEquals(6f * (1 - 1f/24 / (1f/24 + round(1f/23))), s.score(), 0f);
+
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
+
+ reader.close();
+ dir.close();
+ }
+
+ public void testExplanations() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
+ .setMergePolicy(newLogMergePolicy(random().nextBoolean())));
+ Document doc = new Document();
+ FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+ doc.add(pagerank);
+
+ pagerank.setFeatureValue(10);
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(100);
+ writer.addDocument(doc);
+
+ writer.addDocument(new Document()); // gap
+
+ pagerank.setFeatureValue(1);
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(42);
+ writer.addDocument(doc);
+
+ DirectoryReader reader = writer.getReader();
+ writer.close();
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+
+ QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
+ QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 1f, 12f), searcher);
+ QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 1f, 12f, 0.6f), searcher);
+
+ // Test boosts that are > 1
+ QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f), searcher);
+ QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 3f, 12f), searcher);
+ QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 3f, 12f, 0.6f), searcher);
+
+ // Test boosts that are < 1
+ QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", .2f, 4.5f), searcher);
+ QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", .2f, 12f), searcher);
+ QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", .2f, 12f, 0.6f), searcher);
+
+ reader.close();
+ dir.close();
+ }
+
+ public void testLogSimScorer() throws IOException {
+ doTestSimScorer(new FeatureField.LogFunction(4.5f).scorer("foo", 3f));
+ }
+
+ public void testSatuSimScorer() throws IOException {
+ doTestSimScorer(new FeatureField.SaturationFunction(20f).scorer("foo", 3f));
+ }
+
+ public void testSigmSimScorer() throws IOException {
+ doTestSimScorer(new FeatureField.SigmoidFunction(20f, 0.6f).scorer("foo", 3f));
+ }
+
+ private void doTestSimScorer(SimScorer s) throws IOException {
+ float maxScore = s.score(0, Float.MAX_VALUE);
+ assertTrue(Float.isFinite(maxScore)); // used to compute max scores
+ // Test that the score doesn't decrease with freq; on this branch the arguments are (doc, freq)
+ for (int freq = 2; freq < 65536; ++freq) {
+ assertTrue(s.score(0, freq - 1) <= s.score(0, freq));
+ }
+ assertTrue(s.score(0, 65535) <= maxScore);
+ }
+
+ public void testComputePivotFeatureValue() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
+ Document doc = new Document();
+ FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+ doc.add(pagerank);
+
+ pagerank.setFeatureValue(10);
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(100);
+ writer.addDocument(doc);
+
+ writer.addDocument(new Document()); // gap
+
+ pagerank.setFeatureValue(1);
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(42);
+ writer.addDocument(doc);
+
+ DirectoryReader reader = writer.getReader();
+ writer.close();
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+ float pivot = FeatureField.computePivotFeatureValue(searcher, "features", "pagerank");
+ double expected = Math.pow(10 * 100 * 1 * 42, 1/4.); // geometric mean
+ assertEquals(expected, pivot, 0.1);
+
+ reader.close();
+ dir.close();
+ }
+
+ public void testDemo() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
+ .setMergePolicy(newLogMergePolicy(random().nextBoolean())));
+ Document doc = new Document();
+ FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+ doc.add(pagerank);
+ TextField body = new TextField("body", "", Store.NO);
+ doc.add(body);
+
+ pagerank.setFeatureValue(10);
+ body.setStringValue("Apache Lucene");
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(1000);
+ body.setStringValue("Apache Web HTTP server");
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(1);
+ body.setStringValue("Lucene is a search engine");
+ writer.addDocument(doc);
+
+ pagerank.setFeatureValue(42);
+ body.setStringValue("Lucene in the sky with diamonds");
+ writer.addDocument(doc);
+
+ DirectoryReader reader = writer.getReader();
+ writer.close();
+
+ // NOTE: If you need to make changes below, then you likely also need to
+ // update javadocs of FeatureField.
+
+ IndexSearcher searcher = new IndexSearcher(reader);
+ searcher.setSimilarity(new BM25Similarity());
+ Query query = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
+ .add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
+ .build();
+ Query boost = FeatureField.newSaturationQuery(searcher, "features", "pagerank");
+ Query boostedQuery = new BooleanQuery.Builder()
+ .add(query, Occur.MUST)
+ .add(boost, Occur.SHOULD)
+ .build();
+ TopDocs topDocs = searcher.search(boostedQuery, 10);
+ assertEquals(4, topDocs.scoreDocs.length);
+ assertEquals(1, topDocs.scoreDocs[0].doc);
+ assertEquals(0, topDocs.scoreDocs[1].doc);
+ assertEquals(3, topDocs.scoreDocs[2].doc);
+ assertEquals(2, topDocs.scoreDocs[3].doc);
+
+ reader.close();
+ dir.close();
+ }
+
+}
[2/2] lucene-solr:master: LUCENE-8197: Efficient integration of
static scoring factors.
Posted by jp...@apache.org.
LUCENE-8197: Efficient integration of static scoring factors.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/71099343
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/71099343
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/71099343
Branch: refs/heads/master
Commit: 710993435f365fce44a60b7c498ce6af8327f92c
Parents: 2e35ef2
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Mar 20 11:21:39 2018 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Mar 20 11:57:29 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../apache/lucene/document/FeatureField.java | 496 +++++++++++++++++++
.../apache/lucene/document/FeatureQuery.java | 157 ++++++
.../lucene/document/TestFeatureField.java | 317 ++++++++++++
4 files changed, 973 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71099343/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2769098..c435fec 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -103,6 +103,9 @@ New Features
deleted documents around for later reuse. See "IW.softUpdateDocument(...)"
for reference. (Simon Willnauer)
+* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
+ static relevance signals into the final score. (Adrien Grand, Robert Muir)
+
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71099343/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
new file mode 100644
index 0000000..1b8a4e5
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
@@ -0,0 +1,496 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermStates;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+
+/**
+ * {@link Field} that can be used to store static scoring factors into
+ * documents. This is mostly inspired from the work from Nick Craswell,
+ * Stephen Robertson, Hugo Zaragoza and Michael Taylor. Relevance weighting
+ * for query independent evidence. Proceedings of the 28th annual international
+ * ACM SIGIR conference on Research and development in information retrieval.
+ * August 15-19, 2005, Salvador, Brazil.
+ * <p>
+ * Feature values are internally encoded as term frequencies. Putting
+ * feature queries as
+ * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses of a
+ * {@link BooleanQuery} allows to combine query-dependent scores (eg. BM25)
+ * with query-independent scores using a linear combination. The fact that
+ * feature values are stored as frequencies also allows search logic to
+ * efficiently skip documents that can't be competitive when total hit counts
+ * are not requested. This makes it a compelling option compared to storing
+ * such factors eg. in a doc-value field.
+ * <p>
+ * This field may only store factors that are positively correlated with the
+ * final score, like pagerank. In case of factors that are inversely correlated
+ * with the score like url length, the inverse of the scoring factor should be
+ * stored, ie. {@code 1/urlLength}.
+ * <p>
+ * This field only considers the top 9 significant bits for storage efficiency
+ * which allows to store them on 16 bits internally. In practice this limitation
+ * means that values are stored with a relative precision of
+ * 2<sup>-8</sup> = 0.00390625.
+ * <p>
+ * Given a scoring factor {@code S > 0} and its weight {@code w > 0}, there
+ * are three ways that S can be turned into a score:
+ * <ul>
+ * <li>{@link #newLogQuery w * log(a + S)}, with a ≥ 1. This function
+ * usually makes sense because the distribution of scoring factors
+ * often follows a power law. This is typically the case for pagerank for
+ * instance. However the paper suggested that the {@code satu} and
+ * {@code sigm} functions give even better results.
+ * <li>{@link #newSaturationQuery satu(S) = w * S / (S + k)}, with k > 0. This
+ * function is similar to the one used by {@link BM25Similarity} in order
+ * to incorporate term frequency into the final score and produces values
+ * between 0 and 1. A value of 0.5 is obtained when S and k are equal.
+ * <li>{@link #newSigmoidQuery sigm(S) = w * S<sup>a</sup> / (S<sup>a</sup> + k<sup>a</sup>)},
+ * with k > 0, a > 0. This function provided even better results
+ * than the two above but is also harder to tune due to the fact it has
+ * 2 parameters. Like with {@code satu}, values are in the 0..1 range and
+ * 0.5 is obtained when S and k are equal.
+ * </ul>
+ * <p>
+ * The constants in the above formulas typically need training in order to
+ * compute optimal values. If you don't know where to start, the
+ * {@link #newSaturationQuery(IndexSearcher, String, String)} method uses
+ * {@code 1f} as a weight and tries to guess a sensible value for the
+ * {@code pivot} parameter of the saturation function based on index
+ * statistics, which shouldn't perform too bad. Here is an example, assuming
+ * that documents have a {@link FeatureField} called 'features' with values for
+ * the 'pagerank' feature.
+ * <pre class="prettyprint">
+ * Query query = new BooleanQuery.Builder()
+ * .add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
+ * .add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
+ * .build();
+ * Query boost = FeatureField.newSaturationQuery(searcher, "features", "pagerank");
+ * Query boostedQuery = new BooleanQuery.Builder()
+ * .add(query, Occur.MUST)
+ * .add(boost, Occur.SHOULD)
+ * .build();
+ * TopDocs topDocs = searcher.search(boostedQuery, 10);
+ * </pre>
+ * @lucene.experimental
+ */
+public final class FeatureField extends Field {
+
+ private static final FieldType FIELD_TYPE = new FieldType();
+ static {
+ FIELD_TYPE.setTokenized(false);
+ FIELD_TYPE.setOmitNorms(true);
+ FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+ }
+
+ private float featureValue;
+
+ /**
+ * Create a feature.
+ * @param fieldName The name of the field to store the information into. All features may be stored in the same field.
+ * @param featureName The name of the feature, eg. 'pagerank'. It will be indexed as a term.
+ * @param featureValue The value of the feature, must be a positive, finite, normal float.
+ */
+ public FeatureField(String fieldName, String featureName, float featureValue) {
+ super(fieldName, featureName, FIELD_TYPE);
+ setFeatureValue(featureValue);
+ }
+
+ /**
+ * Update the feature value of this field. Throws {@link IllegalArgumentException} if the value is not finite or is less than {@link Float#MIN_NORMAL}.
+ */
+ public void setFeatureValue(float featureValue) {
+ if (Float.isFinite(featureValue) == false) {
+ throw new IllegalArgumentException("featureValue must be finite, got: " + featureValue +
+ " for feature " + fieldsData + " on field " + name);
+ }
+ if (featureValue < Float.MIN_NORMAL) {
+ throw new IllegalArgumentException("featureValue must be a positive normal float, got: " +
+ featureValue + " for feature " + fieldsData + " on field " + name +
+ " which is less than the minimum positive normal float: " + Float.MIN_NORMAL);
+ }
+ this.featureValue = featureValue;
+ }
+
+ @Override
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+ FeatureTokenStream stream;
+ if (reuse instanceof FeatureTokenStream) {
+ stream = (FeatureTokenStream) reuse;
+ } else {
+ stream = new FeatureTokenStream();
+ }
+
+ int freqBits = Float.floatToIntBits(featureValue);
+ stream.setValues((String) fieldsData, freqBits >>> 15);
+ return stream;
+ }
+
+ private static final class FeatureTokenStream extends TokenStream {
+ private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+ private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class);
+ private boolean used = true;
+ private String value = null;
+ private int freq = 0;
+
+ private FeatureTokenStream() {
+ }
+
+ /** Sets the values */
+ void setValues(String value, int freq) {
+ this.value = value;
+ this.freq = freq;
+ }
+
+ @Override
+ public boolean incrementToken() {
+ if (used) {
+ return false;
+ }
+ clearAttributes();
+ termAttribute.append(value);
+ freqAttribute.setTermFrequency(freq);
+ used = true;
+ return true;
+ }
+
+ @Override
+ public void reset() {
+ used = false;
+ }
+
+ @Override
+ public void close() {
+ value = null;
+ }
+ }
+
+ private static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
+
+ private static float decodeFeatureValue(float freq) {
+ if (freq > MAX_FREQ) {
+ // This is never used in practice but callers of the SimScorer API might
+ // occasionally call it on eg. Float.MAX_VALUE to compute the max score
+ // so we need to be consistent.
+ return Float.MAX_VALUE;
+ }
+ int tf = (int) freq; // lossless
+ int featureBits = tf << 15;
+ return Float.intBitsToFloat(featureBits);
+ }
+
+ static abstract class FeatureFunction {
+ abstract SimScorer scorer(String field, float w);
+ abstract Explanation explain(String field, String feature, float w, int freq);
+ }
+
+ static final class LogFunction extends FeatureFunction {
+
+ private final float scalingFactor;
+
+ LogFunction(float a) {
+ this.scalingFactor = a;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ LogFunction that = (LogFunction) obj;
+ return scalingFactor == that.scalingFactor;
+ }
+
+ @Override
+ public int hashCode() {
+ return Float.hashCode(scalingFactor);
+ }
+
+ @Override
+ public String toString() {
+ return "LogFunction(scalingFactor=" + scalingFactor + ")";
+ }
+
+ @Override
+ SimScorer scorer(String field, float weight) {
+ return new SimScorer(field) {
+ @Override
+ public float score(float freq, long norm) {
+ return (float) (weight * Math.log(scalingFactor + decodeFeatureValue(freq)));
+ }
+ };
+ }
+
+ @Override
+ Explanation explain(String field, String feature, float w, int freq) {
+ float featureValue = decodeFeatureValue(freq);
+ float score = scorer(field, w).score(freq, 1L);
+ return Explanation.match(score,
+ "Log function on the " + field + " field for the " + feature + " feature, computed as w * log(a + S) from:",
+ Explanation.match(w, "w, weight of this function"),
+ Explanation.match(scalingFactor, "a, scaling factor"),
+ Explanation.match(featureValue, "S, feature value"));
+ }
+ }
+
+ static final class SaturationFunction extends FeatureFunction {
+
+ private final float pivot;
+
+ SaturationFunction(float pivot) {
+ this.pivot = pivot;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ SaturationFunction that = (SaturationFunction) obj;
+ return pivot == that.pivot;
+ }
+
+ @Override
+ public int hashCode() {
+ return Float.hashCode(pivot);
+ }
+
+ @Override
+ public String toString() {
+ return "SaturationFunction(pivot=" + pivot + ")";
+ }
+
+ @Override
+ SimScorer scorer(String field, float weight) {
+ return new SimScorer(field) {
+ @Override
+ public float score(float freq, long norm) {
+ float f = decodeFeatureValue(freq);
+ // should be f / (f + k) but we rewrite it to
+ // 1 - k / (f + k) to make sure it doesn't decrease
+ // with f in spite of rounding
+ return weight * (1 - pivot / (f + pivot));
+ }
+ };
+ }
+
+ @Override
+ Explanation explain(String field, String feature, float weight, int freq) {
+ float featureValue = decodeFeatureValue(freq);
+ float score = scorer(field, weight).score(freq, 1L);
+ return Explanation.match(score,
+ "Saturation function on the " + field + " field for the " + feature + " feature, computed as w * S / (S + k) from:",
+ Explanation.match(weight, "w, weight of this function"),
+ Explanation.match(pivot, "k, pivot feature value that would give a score contribution equal to w/2"),
+ Explanation.match(featureValue, "S, feature value"));
+ }
+ }
+
+ static final class SigmoidFunction extends FeatureFunction {
+
+ private final float pivot, a;
+ private final double pivotPa;
+
+ SigmoidFunction(float pivot, float a) {
+ this.pivot = pivot;
+ this.a = a;
+ this.pivotPa = Math.pow(pivot, a);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || getClass() != obj.getClass()) {
+ return false;
+ }
+ SigmoidFunction that = (SigmoidFunction) obj;
+ return pivot == that.pivot
+ && a == that.a;
+ }
+
+ @Override
+ public int hashCode() {
+ int h = Float.hashCode(pivot);
+ h = 31 * h + Float.hashCode(a);
+ return h;
+ }
+
+ @Override
+ public String toString() {
+ return "SigmoidFunction(pivot=" + pivot + ", a=" + a + ")";
+ }
+
+ @Override
+ SimScorer scorer(String field, float weight) {
+ return new SimScorer(field) {
+ @Override
+ public float score(float freq, long norm) {
+ float f = decodeFeatureValue(freq);
+ // should be f^a / (f^a + k^a) but we rewrite it to
+ // 1 - k^a / (f^a + k^a) to make sure it doesn't decrease
+ // with f in spite of rounding
+ return (float) (weight * (1 - pivotPa / (Math.pow(f, a) + pivotPa)));
+ }
+ };
+ }
+
+ @Override
+ Explanation explain(String field, String feature, float weight, int freq) {
+ float featureValue = decodeFeatureValue(freq);
+ float score = scorer(field, weight).score(freq, 1L); // the norm argument is ignored by this scorer
+ return Explanation.match(score,
+ "Sigmoid function on the " + field + " field for the " + feature + " feature, computed as w * S^a / (S^a + k^a) from:",
+ Explanation.match(weight, "w, weight of this function"),
+ Explanation.match(pivot, "k, pivot feature value that would give a score contribution equal to w/2"),
+ Explanation.match(a, "a, exponent, higher values make the function grow slower before k and faster after k"),
+ Explanation.match(featureValue, "S, feature value"));
+ }
+ }
+
+ /**
+ * Given that IDFs are logs, similarities that incorporate term freq and
+ * document length in sane (ie. saturated) ways should have their score
+ * bounded by a log. So we reject weights that are too high as it would mean
+ * that this clause would completely dominate ranking, removing the need for
+ * query-dependent scores.
+ */
+ private static final float MAX_WEIGHT = Long.SIZE;
+
+ /**
+ * Return a new {@link Query} that will score documents as
+ * {@code weight * Math.log(scalingFactor + S)} where S is the value of the static feature.
+ * @param fieldName field that stores features
+ * @param featureName name of the feature
+ * @param weight weight to give to this feature, must be in (0,64]
+ * @param scalingFactor scaling factor applied before taking the logarithm, must be in [1, +Infinity)
+ * @throws IllegalArgumentException if weight is not in (0,64] or scalingFactor is not in [1, +Infinity)
+ */
+ public static Query newLogQuery(String fieldName, String featureName, float weight, float scalingFactor) {
+ if (weight <= 0 || weight > MAX_WEIGHT) {
+ throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+ }
+ if (scalingFactor < 1 || Float.isFinite(scalingFactor) == false) {
+ throw new IllegalArgumentException("scalingFactor must be >= 1, got: " + scalingFactor);
+ }
+ Query q = new FeatureQuery(fieldName, featureName, new LogFunction(scalingFactor));
+ if (weight != 1f) {
+ q = new BoostQuery(q, weight);
+ }
+ return q;
+ }
+
+ /**
+ * Return a new {@link Query} that will score documents as
+ * {@code weight * S / (S + pivot)} where S is the value of the static feature.
+ * @param fieldName field that stores features
+ * @param featureName name of the feature
+ * @param weight weight to give to this feature, must be in (0,64]
+ * @param pivot feature value that would give a score contribution equal to weight/2, must be in (0, +Infinity)
+ * @throws IllegalArgumentException if weight is not in (0,64] or pivot is not in (0, +Infinity)
+ */
+ public static Query newSaturationQuery(String fieldName, String featureName, float weight, float pivot) {
+ if (weight <= 0 || weight > MAX_WEIGHT) {
+ throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+ }
+ if (pivot <= 0 || Float.isFinite(pivot) == false) {
+ throw new IllegalArgumentException("pivot must be > 0, got: " + pivot);
+ }
+ Query q = new FeatureQuery(fieldName, featureName, new SaturationFunction(pivot));
+ if (weight != 1f) {
+ q = new BoostQuery(q, weight);
+ }
+ return q;
+ }
+
+ /**
+ * Same as {@link #newSaturationQuery(String, String, float, float)} but
+ * uses {@code 1f} as a weight and tries to compute a sensible default value
+ * for {@code pivot} using
+ * {@link #computePivotFeatureValue(IndexSearcher, String, String)}. This
+ * isn't expected to give an optimal configuration of these parameters but
+ * should be a good start if you have no idea what the values of these
+ * parameters should be.
+ * @param searcher the {@link IndexSearcher} that you will search against
+ * @param featureFieldName the field that stores features
+ * @param featureName the name of the feature
+ */
+ public static Query newSaturationQuery(IndexSearcher searcher,
+ String featureFieldName, String featureName) throws IOException {
+ float weight = 1f;
+ float pivot = computePivotFeatureValue(searcher, featureFieldName, featureName);
+ return newSaturationQuery(featureFieldName, featureName, weight, pivot);
+ }
+
+ /**
+ * Return a new {@link Query} that will score documents as
+ * {@code weight * S^a / (S^a + pivot^a)} where S is the value of the static feature.
+ * @param fieldName field that stores features
+ * @param featureName name of the feature
+ * @param weight weight to give to this feature, must be in (0,64]
+ * @param pivot feature value that would give a score contribution equal to weight/2, must be in (0, +Infinity)
+ * @param exp exponent, higher values make the function grow slower before 'pivot' and faster after 'pivot', must be in (0, +Infinity)
+ * @throws IllegalArgumentException if weight is not in (0,64] or either pivot or exp are not in (0, +Infinity)
+ */
+ public static Query newSigmoidQuery(String fieldName, String featureName, float weight, float pivot, float exp) {
+ if (weight <= 0 || weight > MAX_WEIGHT) {
+ throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+ }
+ if (pivot <= 0 || Float.isFinite(pivot) == false) {
+ throw new IllegalArgumentException("pivot must be > 0, got: " + pivot);
+ }
+ if (exp <= 0 || Float.isFinite(exp) == false) {
+ throw new IllegalArgumentException("exp must be > 0, got: " + exp);
+ }
+ Query q = new FeatureQuery(fieldName, featureName, new SigmoidFunction(pivot, exp));
+ if (weight != 1f) {
+ q = new BoostQuery(q, weight);
+ }
+ return q;
+ }
+
+ /**
+ * Compute a feature value that may be used as the {@code pivot} parameter of
+ * the {@link #newSaturationQuery(String, String, float, float)} and
+ * {@link #newSigmoidQuery(String, String, float, float, float)} factory
+ * methods. The implementation takes the average of the int bits of the float
+ * representation in practice before converting it back to a float. Given that
+ * floats store the exponent in the higher bits, it means that the result will
+ * be an approximation of the geometric mean of all feature values.
+ * @param searcher the {@link IndexSearcher} to search against
+ * @param featureField the field that stores features
+ * @param featureName the name of the feature
+ */
+ public static float computePivotFeatureValue(IndexSearcher searcher, String featureField, String featureName) throws IOException {
+ Term term = new Term(featureField, featureName);
+ TermStates states = TermStates.build(searcher.getIndexReader().getContext(), term, true);
+ float avgFreq = (float) ((double) states.totalTermFreq() / states.docFreq());
+ return decodeFeatureValue(avgFreq);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71099343/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java
new file mode 100644
index 0000000..841b2ad
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.lucene.document.FeatureField.FeatureFunction;
+import org.apache.lucene.index.ImpactsEnum;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Query that matches the documents whose field {@code fieldName} contains the
+ * term {@code featureName}, and scores them with the {@link SimScorer} that the
+ * configured {@link FeatureFunction} produces, fed with the frequency of that
+ * term in the document.
+ */
+final class FeatureQuery extends Query {
+
+  private final String fieldName;
+  private final String featureName;
+  private final FeatureFunction function;
+
+  /** Sole constructor; every argument must be non-null. */
+  FeatureQuery(String fieldName, String featureName, FeatureFunction function) {
+    this.fieldName = Objects.requireNonNull(fieldName);
+    this.featureName = Objects.requireNonNull(featureName);
+    this.function = Objects.requireNonNull(function);
+  }
+
+  /** Two feature queries are equal when field, feature and function are all equal. */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj != null && getClass() == obj.getClass()) {
+      FeatureQuery other = (FeatureQuery) obj;
+      return Objects.equals(fieldName, other.fieldName)
+          && Objects.equals(featureName, other.featureName)
+          && Objects.equals(function, other.function);
+    }
+    return false;
+  }
+
+  /** Combines the hashes of the class, the field, the feature and the function. */
+  @Override
+  public int hashCode() {
+    int classHash = getClass().hashCode();
+    return 31 * (31 * (31 * classHash + fieldName.hashCode()) + featureName.hashCode()) + function.hashCode();
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
+    return new Weight(this) {
+
+      /** This weight always opts out of caching. */
+      @Override
+      public boolean isCacheable(LeafReaderContext ctx) {
+        return false;
+      }
+
+      /** Contributes no terms. */
+      @Override
+      public void extractTerms(Set<Term> terms) {}
+
+      /**
+       * Explains the score of {@code doc}: a no-match explanation when the
+       * field, the feature, or the value of the feature on this document is
+       * missing, and the explanation of the function otherwise.
+       */
+      @Override
+      public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+        final String prefix = "weight(" + getQuery() + " in " + doc + ") [" + function + "]";
+
+        final Terms fieldTerms = context.reader().terms(fieldName);
+        if (fieldTerms == null) {
+          return Explanation.noMatch(prefix + ". Field " + fieldName + " doesn't exist.");
+        }
+
+        final TermsEnum featureTerms = fieldTerms.iterator();
+        if (!featureTerms.seekExact(new BytesRef(featureName))) {
+          return Explanation.noMatch(prefix + ". Feature " + featureName + " doesn't exist.");
+        }
+
+        final PostingsEnum featurePostings = featureTerms.postings(null, PostingsEnum.FREQS);
+        if (featurePostings.advance(doc) == doc) {
+          return function.explain(fieldName, featureName, boost, featurePostings.freq());
+        }
+        return Explanation.noMatch(prefix + ". Feature " + featureName + " isn't set.");
+      }
+
+      /**
+       * Returns a scorer over the postings of the feature, or {@code null} if
+       * this segment has no such field or no such feature.
+       */
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        final Terms fieldTerms = context.reader().terms(fieldName);
+        if (fieldTerms == null) {
+          return null;
+        }
+
+        final TermsEnum featureTerms = fieldTerms.iterator();
+        if (!featureTerms.seekExact(new BytesRef(featureName))) {
+          return null;
+        }
+
+        final SimScorer simScorer = function.scorer(fieldName, boost);
+        final ImpactsEnum featureImpacts = featureTerms.impacts(simScorer, PostingsEnum.FREQS);
+        return new FeatureScorer(this, simScorer, featureImpacts);
+      }
+
+    };
+  }
+
+  @Override
+  public String toString(String field) {
+    return "FeatureQuery(field=" + fieldName + ", feature=" + featureName + ", function=" + function + ")";
+  }
+
+  /**
+   * Scorer that iterates over an {@link ImpactsEnum} and scores the current
+   * document by passing its frequency, together with a constant {@code 1L},
+   * to a {@link SimScorer}. Doc IDs, shallow advancing and max scores are all
+   * delegated to the impacts enum.
+   */
+  private static final class FeatureScorer extends Scorer {
+
+    private final SimScorer simScorer;
+    private final ImpactsEnum impacts;
+
+    FeatureScorer(Weight weight, SimScorer simScorer, ImpactsEnum impacts) {
+      super(weight);
+      this.simScorer = simScorer;
+      this.impacts = impacts;
+    }
+
+    @Override
+    public int docID() {
+      return impacts.docID();
+    }
+
+    @Override
+    public float score() throws IOException {
+      return simScorer.score(impacts.freq(), 1L);
+    }
+
+    @Override
+    public DocIdSetIterator iterator() {
+      return impacts;
+    }
+
+    @Override
+    public int advanceShallow(int target) throws IOException {
+      return impacts.advanceShallow(target);
+    }
+
+    @Override
+    public float getMaxScore(int upTo) throws IOException {
+      return impacts.getMaxScore(upTo);
+    }
+
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/71099343/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
new file mode 100644
index 0000000..2afc250
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests for {@link FeatureField} and the queries created by its factory methods.
+ * Directories and readers are managed with try-with-resources so that they are
+ * closed even when an assertion fails in the middle of a test.
+ */
+public class TestFeatureField extends LuceneTestCase {
+
+  /** Round a float value the same way that {@link FeatureField} rounds feature values. */
+  private static float round(float f) {
+    int bits = Float.floatToIntBits(f);
+    bits &= ~0 << 15; // clear last 15 bits
+    return Float.intBitsToFloat(bits);
+  }
+
+  /**
+   * Moves {@code scorer} to its next document and checks that it is
+   * {@code expectedDoc} and that its score is exactly {@code expectedScore}.
+   */
+  private static void assertNextDoc(Scorer scorer, int expectedDoc, float expectedScore) throws IOException {
+    assertEquals(expectedDoc, scorer.iterator().nextDoc());
+    assertEquals(expectedScore, scorer.score(), 0f);
+  }
+
+  /** Checks that {@code scorer} has no more documents. */
+  private static void assertExhausted(Scorer scorer) throws IOException {
+    assertEquals(DocIdSetIterator.NO_MORE_DOCS, scorer.iterator().nextDoc());
+  }
+
+  /**
+   * Checks doc IDs and exact scores of the log, saturation and sigmoid queries
+   * on a single-segment index. The factor 6 in the expected scores is the query
+   * weight (3) times the boost passed to {@code createWeight} (2).
+   */
+  public void testBasics() throws Exception {
+    try (Directory dir = newDirectory()) {
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
+          .setMergePolicy(newLogMergePolicy(random().nextBoolean())));
+      Document doc = new Document();
+      FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+      FeatureField urlLength = new FeatureField("features", "urlLen", 1);
+      doc.add(pagerank);
+      doc.add(urlLength);
+
+      pagerank.setFeatureValue(10);
+      urlLength.setFeatureValue(1f / 24);
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(100);
+      urlLength.setFeatureValue(1f / 20);
+      writer.addDocument(doc);
+
+      writer.addDocument(new Document()); // gap
+
+      pagerank.setFeatureValue(1);
+      urlLength.setFeatureValue(1f / 100);
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(42);
+      urlLength.setFeatureValue(1f / 23);
+      writer.addDocument(doc);
+
+      // a single segment is needed since only the first leaf is scored below
+      writer.forceMerge(1);
+      try (DirectoryReader reader = writer.getReader()) {
+        writer.close();
+
+        IndexSearcher searcher = new IndexSearcher(reader);
+        LeafReaderContext context = reader.leaves().get(0);
+
+        Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
+        Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
+        Scorer s = w.scorer(context);
+
+        assertNextDoc(s, 0, (float) (6.0 * Math.log(4.5f + 10)));
+        assertNextDoc(s, 1, (float) (6.0 * Math.log(4.5f + 100)));
+        assertNextDoc(s, 3, (float) (6.0 * Math.log(4.5f + 1)));
+        assertNextDoc(s, 4, (float) (6.0 * Math.log(4.5f + 42)));
+        assertExhausted(s);
+
+        q = FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f);
+        w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
+        s = w.scorer(context);
+
+        assertNextDoc(s, 0, 6f * (1 - 4.5f / (4.5f + 10)));
+        assertNextDoc(s, 1, 6f * (1 - 4.5f / (4.5f + 100)));
+        assertNextDoc(s, 3, 6f * (1 - 4.5f / (4.5f + 1)));
+        assertNextDoc(s, 4, 6f * (1 - 4.5f / (4.5f + 42)));
+        assertExhausted(s);
+
+        q = FeatureField.newSigmoidQuery("features", "pagerank", 3f, 4.5f, 0.6f);
+        w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
+        s = w.scorer(context);
+        double kPa = Math.pow(4.5f, 0.6f);
+
+        assertNextDoc(s, 0, (float) (6 * (1 - kPa / (kPa + Math.pow(10, 0.6f)))));
+        assertNextDoc(s, 1, (float) (6 * (1 - kPa / (kPa + Math.pow(100, 0.6f)))));
+        assertNextDoc(s, 3, (float) (6 * (1 - kPa / (kPa + Math.pow(1, 0.6f)))));
+        assertNextDoc(s, 4, (float) (6 * (1 - kPa / (kPa + Math.pow(42, 0.6f)))));
+        assertExhausted(s);
+
+        // urlLen values are not exactly representable once indexed, hence round()
+        q = FeatureField.newSaturationQuery("features", "urlLen", 3f, 1f/24);
+        w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
+        s = w.scorer(context);
+
+        assertNextDoc(s, 0, 6f * (1 - (1f/24) / (1f/24 + round(1f/24))));
+        assertNextDoc(s, 1, 6f * (1 - 1f/24 / (1f/24 + round(1f/20))));
+        assertNextDoc(s, 3, 6f * (1 - 1f/24 / (1f/24 + round(1f/100))));
+        assertNextDoc(s, 4, 6f * (1 - 1f/24 / (1f/24 + round(1f/23))));
+        assertExhausted(s);
+      }
+    }
+  }
+
+  /** Runs {@link QueryUtils#check} on all three query flavours with boosts equal to, above and below 1. */
+  public void testExplanations() throws Exception {
+    try (Directory dir = newDirectory()) {
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
+          .setMergePolicy(newLogMergePolicy(random().nextBoolean())));
+      Document doc = new Document();
+      FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+      doc.add(pagerank);
+
+      pagerank.setFeatureValue(10);
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(100);
+      writer.addDocument(doc);
+
+      writer.addDocument(new Document()); // gap
+
+      pagerank.setFeatureValue(1);
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(42);
+      writer.addDocument(doc);
+
+      try (DirectoryReader reader = writer.getReader()) {
+        writer.close();
+
+        IndexSearcher searcher = new IndexSearcher(reader);
+
+        QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
+        QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 1f, 12f), searcher);
+        QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 1f, 12f, 0.6f), searcher);
+
+        // Test boosts that are > 1
+        QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f), searcher);
+        QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 3f, 12f), searcher);
+        QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 3f, 12f, 0.6f), searcher);
+
+        // Test boosts that are < 1
+        QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", .2f, 4.5f), searcher);
+        QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", .2f, 12f), searcher);
+        QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", .2f, 12f, 0.6f), searcher);
+      }
+    }
+  }
+
+  public void testLogSimScorer() {
+    doTestSimScorer(new FeatureField.LogFunction(4.5f).scorer("foo", 3f));
+  }
+
+  public void testSatuSimScorer() {
+    doTestSimScorer(new FeatureField.SaturationFunction(20f).scorer("foo", 3f));
+  }
+
+  public void testSigmSimScorer() {
+    doTestSimScorer(new FeatureField.SigmoidFunction(20f, 0.6f).scorer("foo", 3f));
+  }
+
+  /**
+   * Checks that the score for {@link Float#MAX_VALUE} is finite, that scores
+   * never decrease when the freq grows from 1 to 65535, and that they stay
+   * below the score for {@link Float#MAX_VALUE}.
+   */
+  private void doTestSimScorer(SimScorer s) {
+    float maxScore = s.score(Float.MAX_VALUE, 1);
+    assertTrue(Float.isFinite(maxScore)); // used to compute max scores
+    // Test that the score doesn't decrease with freq
+    for (int freq = 2; freq < 65536; ++freq) {
+      assertTrue(s.score(freq - 1, 1L) <= s.score(freq, 1L));
+    }
+    assertTrue(s.score(65535, 1L) <= maxScore);
+  }
+
+  /** Checks that the computed pivot is close to the geometric mean of the indexed values. */
+  public void testComputePivotFeatureValue() throws IOException {
+    try (Directory dir = newDirectory()) {
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
+      Document doc = new Document();
+      FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+      doc.add(pagerank);
+
+      pagerank.setFeatureValue(10);
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(100);
+      writer.addDocument(doc);
+
+      writer.addDocument(new Document()); // gap
+
+      pagerank.setFeatureValue(1);
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(42);
+      writer.addDocument(doc);
+
+      try (DirectoryReader reader = writer.getReader()) {
+        writer.close();
+
+        IndexSearcher searcher = new IndexSearcher(reader);
+        float pivot = FeatureField.computePivotFeatureValue(searcher, "features", "pagerank");
+        double expected = Math.pow(10 * 100 * 1 * 42, 1/4.); // geometric mean
+        assertEquals(expected, pivot, 0.1);
+      }
+    }
+  }
+
+  /** Combines a BM25 text query with a saturation query on pagerank and checks the resulting ranking. */
+  public void testDemo() throws IOException {
+    try (Directory dir = newDirectory()) {
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig()
+          .setMergePolicy(newLogMergePolicy(random().nextBoolean())));
+      Document doc = new Document();
+      FeatureField pagerank = new FeatureField("features", "pagerank", 1);
+      doc.add(pagerank);
+      TextField body = new TextField("body", "", Store.NO);
+      doc.add(body);
+
+      pagerank.setFeatureValue(10);
+      body.setStringValue("Apache Lucene");
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(1000);
+      body.setStringValue("Apache Web HTTP server");
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(1);
+      body.setStringValue("Lucene is a search engine");
+      writer.addDocument(doc);
+
+      pagerank.setFeatureValue(42);
+      body.setStringValue("Lucene in the sky with diamonds");
+      writer.addDocument(doc);
+
+      try (DirectoryReader reader = writer.getReader()) {
+        writer.close();
+
+        // NOTE: If you need to make changes below, then you likely also need to
+        // update javadocs of FeatureField.
+
+        IndexSearcher searcher = new IndexSearcher(reader);
+        searcher.setSimilarity(new BM25Similarity());
+        Query query = new BooleanQuery.Builder()
+            .add(new TermQuery(new Term("body", "apache")), Occur.SHOULD)
+            .add(new TermQuery(new Term("body", "lucene")), Occur.SHOULD)
+            .build();
+        Query boost = FeatureField.newSaturationQuery(searcher, "features", "pagerank");
+        Query boostedQuery = new BooleanQuery.Builder()
+            .add(query, Occur.MUST)
+            .add(boost, Occur.SHOULD)
+            .build();
+        TopDocs topDocs = searcher.search(boostedQuery, 10);
+        assertEquals(4, topDocs.scoreDocs.length);
+        assertEquals(1, topDocs.scoreDocs[0].doc);
+        assertEquals(0, topDocs.scoreDocs[1].doc);
+        assertEquals(3, topDocs.scoreDocs[2].doc);
+        assertEquals(2, topDocs.scoreDocs[3].doc);
+      }
+    }
+  }
+
+}