You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/11/10 22:33:40 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9594 Add linear
function for FeatureField
This is an automated email from the ASF dual-hosted git repository.
mayya pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new e76a25e LUCENE-9594 Add linear function for FeatureField
e76a25e is described below
commit e76a25ebc72520ee3cbbeca31df82dd3a3a31048
Author: Mayya Sharipova <ma...@elastic.co>
AuthorDate: Tue Nov 10 17:08:08 2020 -0500
LUCENE-9594 Add linear function for FeatureField
This adds a linear function and newLinearQuery for FeatureField
---
lucene/CHANGES.txt | 5 +-
.../org/apache/lucene/document/FeatureField.java | 68 +++++++++++++++++++++-
.../apache/lucene/document/TestFeatureField.java | 25 ++++++++
3 files changed, 96 insertions(+), 2 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 9cbc402..e5eb1d8 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -14,7 +14,10 @@ New Features
* LUCENE-9552: New LatLonPoint query that accepts an array of LatLonGeometries. (Ignacio Vera)
-* LUCENE-9553: New XYPoint query that accepts an array of XYGeometries. (Ignacio Vera)
+* LUCENE-9553: New XYPoint query that accepts an array of XYGeometries. (Ignacio Vera)
+
+* LUCENE-9594: FeatureField supports newLinearQuery, which scores documents using the raw
+ indexed feature values without any transformation. (Mayya Sharipova, Adrien Grand)
Improvements
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
index 2ca048c..dafcbf4 100644
--- a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
+++ b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java
@@ -66,7 +66,7 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
* 2<sup>-8</sup> = 0.00390625.
* <p>
* Given a scoring factor {@code S > 0} and its weight {@code w > 0}, there
- * are three ways that S can be turned into a score:
+ * are four ways that S can be turned into a score:
* <ul>
* <li>{@link #newLogQuery w * log(a + S)}, with a ≥ 1. This function
* usually makes sense because the distribution of scoring factors
@@ -82,6 +82,12 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
* than the two above but is also harder to tune due to the fact it has
* 2 parameters. Like with {@code satu}, values are in the 0..1 range and
* 0.5 is obtained when S and k are equal.
+ * <li>{@link #newLinearQuery w * S}. Expert: This function doesn't apply
+ * any transformation to an indexed feature value, and the indexed value itself,
+ * multiplied by weight, determines the score. Thus, there is an expectation
+ * that a feature value is encoded in the index in a way that makes
+ * sense for scoring.
+ *
* </ul>
* <p>
* The constants in the above formulas typically need training in order to
@@ -217,6 +223,46 @@ public final class FeatureField extends Field {
FeatureFunction rewrite(IndexReader reader) throws IOException { return this; }
}
+ /**
+  * Feature function that scores a document as {@code w * S}: the decoded
+  * feature value multiplied by the weight, with no other transformation.
+  * The class declares no fields, so any two instances compare equal and
+  * share the same hash code.
+  */
+ static final class LinearFunction extends FeatureFunction {
+   /**
+    * Returns a scorer computing {@code w * decodeFeatureValue(freq)}.
+    * The {@code norm} argument of the returned scorer is ignored.
+    */
+   @Override
+   SimScorer scorer(float w) {
+     return new SimScorer() {
+       @Override
+       public float score(float freq, long norm) {
+         return w * decodeFeatureValue(freq);
+       }
+     };
+   }
+
+   /**
+    * Builds an explanation whose value is the score produced by
+    * {@link #scorer(float)} for {@code freq}, listing the weight and the
+    * decoded feature value as its two details.
+    */
+   @Override
+   Explanation explain(String field, String feature, float w, int freq) {
+     float featureValue = decodeFeatureValue(freq);
+     // The norm passed here is arbitrary: the linear scorer never reads it.
+     float score = scorer(w).score(freq, 1L);
+     return Explanation.match(score,
+         "Linear function on the " + field + " field for the " + feature + " feature, computed as w * S from:",
+         Explanation.match(w, "w, weight of this function"),
+         Explanation.match(featureValue, "S, feature value"));
+   }
+
+   @Override
+   public String toString() {
+     return "LinearFunction";
+   }
+
+   /** Hash code derived from the class only, consistent with {@link #equals(Object)}. */
+   @Override
+   public int hashCode() {
+     return getClass().hashCode();
+   }
+
+   /** Two functions are equal exactly when they are of the same class. */
+   @Override
+   public boolean equals(Object obj) {
+     return obj != null && getClass() == obj.getClass();
+   }
+ }
+
static final class LogFunction extends FeatureFunction {
private final float scalingFactor;
@@ -406,6 +452,26 @@ public final class FeatureField extends Field {
*/
private static final float MAX_WEIGHT = Long.SIZE;
+
+ /**
+  * Return a new {@link Query} that will score documents as
+  * {@code weight * S} where S is the value of the static feature.
+  * @param fieldName field that stores features
+  * @param featureName name of the feature
+  * @param weight weight to give to this feature, must be in (0,64]
+  * @throws IllegalArgumentException if weight is not in (0,64], which includes NaN
+  */
+ public static Query newLinearQuery(String fieldName, String featureName, float weight) {
+   // Written as a negated range test on purpose: every comparison against NaN
+   // is false, so "weight <= 0 || weight > MAX_WEIGHT" would let NaN through.
+   if (!(weight > 0 && weight <= MAX_WEIGHT)) {
+     throw new IllegalArgumentException("weight must be in (0, " + MAX_WEIGHT + "], got: " + weight);
+   }
+   Query q = new FeatureQuery(fieldName, featureName, new LinearFunction());
+   // Only wrap in a BoostQuery when the weight actually differs from 1.
+   if (weight != 1f) {
+     q = new BoostQuery(q, weight);
+   }
+   return q;
+ }
+
/**
* Return a new {@link Query} that will score documents as
* {@code weight * Math.log(scalingFactor + S)} where S is the value of the static feature.
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
index 6a0a335..f00e475 100644
--- a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
+++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java
@@ -106,6 +106,24 @@ public class TestFeatureField extends LuceneTestCase {
assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
+ q = FeatureField.newLinearQuery("features", "pagerank", 3f);
+ w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
+ s = w.scorer(context);
+
+ assertEquals(0, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * 10), s.score(), 0f);
+
+ assertEquals(1, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * 100), s.score(), 0f);
+
+ assertEquals(3, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * 1), s.score(), 0f);
+
+ assertEquals(4, s.iterator().nextDoc());
+ assertEquals((float) (6.0 * 42), s.score(), 0f);
+
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, s.iterator().nextDoc());
+
q = FeatureField.newSaturationQuery("features", "pagerank", 3f, 4.5f);
w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
s = w.scorer(context);
@@ -193,16 +211,19 @@ public class TestFeatureField extends LuceneTestCase {
IndexSearcher searcher = new IndexSearcher(reader);
QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 1f, 4.5f), searcher);
+ QueryUtils.check(random(), FeatureField.newLinearQuery("features", "pagerank", 1f), searcher);
QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 1f, 12f), searcher);
QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 1f, 12f, 0.6f), searcher);
// Test boosts that are > 1
QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f), searcher);
+ QueryUtils.check(random(), FeatureField.newLinearQuery("features", "pagerank", 3f), searcher);
QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", 3f, 12f), searcher);
QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", 3f, 12f, 0.6f), searcher);
// Test boosts that are < 1
QueryUtils.check(random(), FeatureField.newLogQuery("features", "pagerank", .2f, 4.5f), searcher);
+ QueryUtils.check(random(), FeatureField.newLinearQuery("features", "pagerank", .2f), searcher);
QueryUtils.check(random(), FeatureField.newSaturationQuery("features", "pagerank", .2f, 12f), searcher);
QueryUtils.check(random(), FeatureField.newSigmoidQuery("features", "pagerank", .2f, 12f, 0.6f), searcher);
@@ -214,6 +235,10 @@ public class TestFeatureField extends LuceneTestCase {
doTestSimScorer(new FeatureField.LogFunction(4.5f).scorer(3f));
}
+ public void testLinearSimScorer() {
+ doTestSimScorer(new FeatureField.LinearFunction().scorer(1f));
+ }
+
public void testSatuSimScorer() {
doTestSimScorer(new FeatureField.SaturationFunction("foo", "bar", 20f).scorer(3f));
}