You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2020/02/24 10:45:27 UTC
[lucene-solr] branch branch_8x updated: SOLR-12238: Handle boosts
in QueryBuilder
This is an automated email from the ASF dual-hosted git repository.
romseygeek pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 2752d50 SOLR-12238: Handle boosts in QueryBuilder
2752d50 is described below
commit 2752d50dd1dcf758a32dc573d02967612a2cf1ff
Author: Alessandro Benedetti <a....@sease.io>
AuthorDate: Mon Feb 24 10:29:41 2020 +0000
SOLR-12238: Handle boosts in QueryBuilder
QueryBuilder now detects per-term boosts supplied by a BoostAttribute when
building queries using a TokenStream. This commit also adds a DelimitedBoostTokenFilter
that parses boosts from tokens using a delimiter token, and exposes this in Solr
---
.../analysis/boost/DelimitedBoostTokenFilter.java | 63 ++++++
.../boost/DelimitedBoostTokenFilterFactory.java | 63 ++++++
.../apache/lucene/analysis/boost/package-info.java | 21 ++
....apache.lucene.analysis.util.TokenFilterFactory | 1 +
.../boost/DelimitedBoostTokenFilterTest.java | 85 ++++++++
.../org/apache/lucene/search/BoostAttribute.java | 1 +
.../java/org/apache/lucene/util/QueryBuilder.java | 117 ++++++++---
.../org/apache/lucene/util/TestQueryBuilder.java | 50 +++++
.../complexPhrase/ComplexPhraseQueryParser.java | 4 +-
.../apache/solr/parser/SolrQueryParserBase.java | 28 ++-
.../test-files/solr/collection1/conf/schema12.xml | 38 ++++
.../test-files/solr/collection1/conf/synonyms.txt | 16 +-
.../TestManagedSynonymGraphFilterFactory.java | 76 +++++++
.../apache/solr/search/TestSolrQueryParser.java | 219 ++++++++++++++++++++-
solr/solr-ref-guide/src/filter-descriptions.adoc | 99 ++++++++++
15 files changed, 840 insertions(+), 41 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
new file mode 100644
index 0000000..c37f7d7
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilter.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.boost;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.search.BoostAttribute;
+
+import java.io.IOException;
+
+
+/**
+ * Characters before the delimiter are the "token", those after are the boost.
+ * <p>
+ * For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token
+ * and 0.7 is the boost.
+ * <p>
+ * Note: make sure your Tokenizer doesn't split on the delimiter, or this won't work.
+ */
+public final class DelimitedBoostTokenFilter extends TokenFilter {
+ private final char delimiter;
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);
+
+ public DelimitedBoostTokenFilter(TokenStream input, char delimiter) {
+ super(input);
+ this.delimiter = delimiter;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ final char[] buffer = termAtt.buffer();
+ final int length = termAtt.length();
+ for (int i = 0; i < length; i++) {
+ if (buffer[i] == delimiter) {
+ float boost = Float.parseFloat(new String(buffer, i + 1, (length - (i + 1))));
+ boostAtt.setBoost(boost);
+ termAtt.setLength(i);
+ return true;
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java
new file mode 100644
index 0000000..7436034
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterFactory.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.boost;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link DelimitedBoostTokenFilter}.
+ * <pre class="prettyprint">
+ * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/>
+ * </analyzer>
+ * </fieldType></pre>
+ *
+ * @lucene.spi {@value #NAME}
+ */
+public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {
+
+ /**
+ * SPI name
+ */
+ public static final String NAME = "delimitedBoost";
+ public static final String DELIMITER_ATTR = "delimiter";
+ public static final char DEFAULT_DELIMITER = '|';
+
+ private final char delimiter;
+
+ /**
+ * Creates a new DelimitedBoostTokenFilterFactory
+ */
+ public DelimitedBoostTokenFilterFactory(Map<String, String> args) {
+ super(args);
+ delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public DelimitedBoostTokenFilter create(TokenStream input) {
+ return new DelimitedBoostTokenFilter(input, delimiter);
+ }
+
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java
new file mode 100644
index 0000000..9bae5dc
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/boost/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Provides various convenience classes for creating boosts on Tokens.
+ */
+package org.apache.lucene.analysis.boost;
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 16fca20..fd13e6f 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
+org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java
new file mode 100644
index 0000000..8b9d690
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/boost/DelimitedBoostTokenFilterTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.boost;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.search.BoostAttribute;
+
+public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase {
+
+ public void testBoosts() throws Exception {
+ String test = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9";
+ DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter
+ (whitespaceMockTokenizer(test),
+ DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
+ CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ BoostAttribute boostAtt = filter.addAttribute(BoostAttribute.class);
+ filter.reset();
+ assertTermEquals("The", filter, termAtt, boostAtt, 1.0f);
+ assertTermEquals("quick", filter, termAtt, boostAtt, 0.4f);
+ assertTermEquals("red", filter, termAtt, boostAtt, 0.5f);
+ assertTermEquals("fox", filter, termAtt, boostAtt, 0.2f);
+ assertTermEquals("jumped", filter, termAtt, boostAtt, 0.1f);
+ assertTermEquals("over", filter, termAtt, boostAtt, 1.0f);
+ assertTermEquals("the", filter, termAtt, boostAtt, 1.0f);
+ assertTermEquals("lazy", filter, termAtt, boostAtt, 0.8f);
+ assertTermEquals("brown", filter, termAtt, boostAtt, 0.9f);
+ assertTermEquals("dogs", filter, termAtt, boostAtt, 0.9f);
+ assertFalse(filter.incrementToken());
+ filter.end();
+ filter.close();
+ }
+
+ public void testNext() throws Exception {
+ String test = "The quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6";
+ DelimitedBoostTokenFilter filter = new DelimitedBoostTokenFilter
+ (whitespaceMockTokenizer(test),
+ DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
+ filter.reset();
+ assertTermEquals("The", filter, 1.0f);
+ assertTermEquals("quick", filter, 0.1f);
+ assertTermEquals("red", filter, 0.2f);
+ assertTermEquals("fox", filter, 0.3f);
+ assertTermEquals("jumped", filter, 0.4f);
+ assertTermEquals("over", filter, 1.0f);
+ assertTermEquals("the", filter, 1.0f);
+ assertTermEquals("lazy", filter, 0.5f);
+ assertTermEquals("brown", filter, 0.6f);
+ assertTermEquals("dogs", filter, 0.6f);
+ assertFalse(filter.incrementToken());
+ filter.end();
+ filter.close();
+ }
+
+ void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
+ CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+ BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, termAtt.toString());
+ float actualBoost = boostAtt.getBoost();
+ assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
+ }
+
+ void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception {
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, termAtt.toString());
+ float actualBoost = boostAtt.getBoost();
+ assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java
index 2a99a08..9030b57 100644
--- a/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java
+++ b/lucene/core/src/java/org/apache/lucene/search/BoostAttribute.java
@@ -32,6 +32,7 @@ import org.apache.lucene.index.Terms; // javadocs only
* @lucene.internal
*/
public interface BoostAttribute extends Attribute {
+ float DEFAULT_BOOST = 1.0f;
/** Sets the boost in this attribute */
public void setBoost(float boost);
/** Retrieves the boost, default is {@code 1.0f}. */
diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
index 66837b7..9eaeb0e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
@@ -30,17 +30,22 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
+import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST;
+
/**
* Creates queries from the {@link Analyzer} chain.
* <p>
@@ -62,6 +67,24 @@ public class QueryBuilder {
protected boolean enableGraphQueries = true;
protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false;
+ /**
+ * Wraps a term and boost
+ */
+ public static class TermAndBoost {
+ /** the term */
+ public final Term term;
+ /** the boost */
+ public final float boost;
+
+ /**
+ * Creates a new TermAndBoost
+ */
+ public TermAndBoost(Term term, float boost) {
+ this.term = term;
+ this.boost = boost;
+ }
+ }
+
/** Creates a new QueryBuilder using the given analyzer. */
public QueryBuilder(Analyzer analyzer) {
this.analyzer = analyzer;
@@ -349,22 +372,32 @@ public class QueryBuilder {
*/
protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
+ BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class);
+
+ SpanQuery result;
+ float boost = DEFAULT_BOOST;
if (termAtt == null) {
return null;
}
List<SpanTermQuery> terms = new ArrayList<>();
while (in.incrementToken()) {
+ boost *= boostAtt.getBoost();
terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
}
if (terms.isEmpty()) {
return null;
} else if (terms.size() == 1) {
- return terms.get(0);
+ result = terms.get(0);
} else {
- return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true);
+ result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true);
}
+
+ if (boost != DEFAULT_BOOST) {
+ result = new SpanBoostQuery(result, boost);
+ }
+ return result;
}
/**
@@ -372,13 +405,14 @@ public class QueryBuilder {
*/
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+ BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
stream.reset();
if (!stream.incrementToken()) {
throw new AssertionError();
}
- return newTermQuery(new Term(field, termAtt.getBytesRef()));
+ return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost());
}
/**
@@ -386,24 +420,25 @@ public class QueryBuilder {
*/
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+ BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
stream.reset();
- List<Term> terms = new ArrayList<>();
+ List<TermAndBoost> terms = new ArrayList<>();
while (stream.incrementToken()) {
- terms.add(new Term(field, termAtt.getBytesRef()));
+ terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
}
- return newSynonymQuery(terms.toArray(new Term[terms.size()]));
+ return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
}
- protected void add(BooleanQuery.Builder q, List<Term> current, BooleanClause.Occur operator) {
+ protected void add(BooleanQuery.Builder q, List<TermAndBoost> current, BooleanClause.Occur operator) {
if (current.isEmpty()) {
return;
}
if (current.size() == 1) {
- q.add(newTermQuery(current.get(0)), operator);
+ q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator);
} else {
- q.add(newSynonymQuery(current.toArray(new Term[current.size()])), operator);
+ q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator);
}
}
@@ -412,10 +447,11 @@ public class QueryBuilder {
*/
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
BooleanQuery.Builder q = newBooleanQuery();
- List<Term> currentQuery = new ArrayList<>();
+ List<TermAndBoost> currentQuery = new ArrayList<>();
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+ BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
stream.reset();
while (stream.incrementToken()) {
@@ -423,7 +459,7 @@ public class QueryBuilder {
add(q, currentQuery, operator);
currentQuery.clear();
}
- currentQuery.add(new Term(field, termAtt.getBytesRef()));
+ currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
}
add(q, currentQuery, operator);
@@ -438,9 +474,10 @@ public class QueryBuilder {
builder.setSlop(slop);
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+ BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
- int position = -1;
-
+ int position = -1;
+ float phraseBoost = DEFAULT_BOOST;
stream.reset();
while (stream.incrementToken()) {
if (enablePositionIncrements) {
@@ -449,9 +486,13 @@ public class QueryBuilder {
position += 1;
}
builder.add(new Term(field, termAtt.getBytesRef()), position);
+ phraseBoost *= boostAtt.getBoost();
}
-
- return builder.build();
+ PhraseQuery query = builder.build();
+ if (phraseBoost == DEFAULT_BOOST) {
+ return query;
+ }
+ return new BoostQuery(query, phraseBoost);
}
/**
@@ -508,33 +549,40 @@ public class QueryBuilder {
end = articulationPoints[i];
}
lastState = end;
- final Query queryPos;
+ final Query positionalQuery;
if (graph.hasSidePath(start)) {
- final Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
+ final Iterator<TokenStream> sidePathsIterator = graph.getFiniteStrings(start, end);
Iterator<Query> queries = new Iterator<Query>() {
@Override
public boolean hasNext() {
- return it.hasNext();
+ return sidePathsIterator.hasNext();
}
@Override
public Query next() {
- TokenStream ts = it.next();
- return createFieldQuery(ts, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0);
+ TokenStream sidePath = sidePathsIterator.next();
+ return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0);
}
};
- queryPos = newGraphSynonymQuery(queries);
+ positionalQuery = newGraphSynonymQuery(queries);
} else {
- Term[] terms = graph.getTerms(field, start);
+ List<AttributeSource> attributes = graph.getTerms(start);
+ TermAndBoost[] terms = attributes.stream()
+ .map(s -> {
+ TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class);
+ BoostAttribute b = s.addAttribute(BoostAttribute.class);
+ return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost());
+ })
+ .toArray(TermAndBoost[]::new);
assert terms.length > 0;
if (terms.length == 1) {
- queryPos = newTermQuery(terms[0]);
+ positionalQuery = newTermQuery(terms[0].term, terms[0].boost);
} else {
- queryPos = newSynonymQuery(terms);
+ positionalQuery = newSynonymQuery(terms);
}
}
- if (queryPos != null) {
- builder.add(queryPos, operator);
+ if (positionalQuery != null) {
+ builder.add(positionalQuery, operator);
}
}
return builder.build();
@@ -649,10 +697,10 @@ public class QueryBuilder {
* This is intended for subclasses that wish to customize the generated queries.
* @return new Query instance
*/
- protected Query newSynonymQuery(Term terms[]) {
- SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field());
- for (Term term : terms) {
- builder.addTerm(term);
+ protected Query newSynonymQuery(TermAndBoost[] terms) {
+ SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field());
+ for (TermAndBoost t : terms) {
+ builder.addTerm(t.term, t.boost);
}
return builder.build();
}
@@ -682,10 +730,15 @@ public class QueryBuilder {
* @param term term
* @return new TermQuery instance
*/
- protected Query newTermQuery(Term term) {
- return new TermQuery(term);
+ protected Query newTermQuery(Term term, float boost) {
+ Query q = new TermQuery(term);
+ if (boost == DEFAULT_BOOST) {
+ return q;
+ }
+ return new BoostQuery(q, boost);
}
+
/**
* Builds a new MultiPhraseQuery instance.
* <p>
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java
index dc54683..9cbc355 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestQueryBuilder.java
@@ -20,6 +20,7 @@ package org.apache.lucene.util;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockSynonymFilter;
@@ -32,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
@@ -506,4 +509,51 @@ public class TestQueryBuilder extends LuceneTestCase {
expectThrows(BooleanQuery.TooManyClauses.class, () -> qb.analyzeGraphPhrase(ts, "", 0));
}
}
+
+ private static final class MockBoostTokenFilter extends TokenFilter {
+
+ final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);
+ final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ protected MockBoostTokenFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken() == false) {
+ return false;
+ }
+ if (termAtt.length() == 3) {
+ boostAtt.setBoost(0.5f);
+ }
+ return true;
+ }
+ }
+
+ public void testTokenStreamBoosts() {
+ Analyzer msa = new MockSynonymAnalyzer();
+ Analyzer a = new AnalyzerWrapper(msa.getReuseStrategy()) {
+ @Override
+ protected Analyzer getWrappedAnalyzer(String fieldName) {
+ return msa;
+ }
+ @Override
+ protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+ return new TokenStreamComponents(components.getSource(), new MockBoostTokenFilter(components.getTokenStream()));
+ }
+ };
+
+ QueryBuilder builder = new QueryBuilder(a);
+ Query q = builder.createBooleanQuery("field", "hot dogs");
+ Query expected = new BooleanQuery.Builder()
+ .add(new BoostQuery(new TermQuery(new Term("field", "hot")), 0.5f), BooleanClause.Occur.SHOULD)
+ .add(new SynonymQuery.Builder("field")
+ .addTerm(new Term("field", "dogs"))
+ .addTerm(new Term("field", "dog"), 0.5f)
+ .build(), BooleanClause.Occur.SHOULD)
+ .build();
+
+ assertEquals(expected, q);
+ }
}
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java
index 9a4043d..d552aef 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/complexPhrase/ComplexPhraseQueryParser.java
@@ -147,7 +147,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
// to throw a runtime exception here if a term for another field is embedded
// in phrase query
@Override
- protected Query newTermQuery(Term term) {
+ protected Query newTermQuery(Term term, float boost) {
if (isPass2ResolvingPhrases) {
try {
checkPhraseClauseIsForSameField(term.field());
@@ -155,7 +155,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
throw new RuntimeException("Error parsing complex phrase", pe);
}
}
- return super.newTermQuery(term);
+ return super.newTermQuery(term, boost);
}
// Helper method used to report on any clauses that appear in query syntax
diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
index 2f65d1c..f0d6ed1 100644
--- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
+++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
@@ -21,6 +21,8 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -599,19 +601,35 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
}
@Override
- protected Query newSynonymQuery(Term terms[]) {
+ protected Query newGraphSynonymQuery(Iterator<Query> sidePathQueriesIterator) {
+ switch (synonymQueryStyle) {
+ case PICK_BEST: {
+ List<Query> sidePathSynonymQueries = new LinkedList<>();
+ sidePathQueriesIterator.forEachRemaining(sidePathSynonymQueries::add);
+ return new DisjunctionMaxQuery(sidePathSynonymQueries, 0.0f);
+ }
+ case AS_SAME_TERM:
+ case AS_DISTINCT_TERMS:{
+ return super.newGraphSynonymQuery(sidePathQueriesIterator);}
+ default:
+ throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery");
+ }
+ }
+
+ @Override
+ protected Query newSynonymQuery(TermAndBoost[] terms) {
switch (synonymQueryStyle) {
case PICK_BEST:
List<Query> currPosnClauses = new ArrayList<Query>(terms.length);
- for (Term term : terms) {
- currPosnClauses.add(newTermQuery(term));
+ for (TermAndBoost term : terms) {
+ currPosnClauses.add(newTermQuery(term.term, term.boost));
}
DisjunctionMaxQuery dm = new DisjunctionMaxQuery(currPosnClauses, 0.0f);
return dm;
case AS_DISTINCT_TERMS:
BooleanQuery.Builder builder = new BooleanQuery.Builder();
- for (Term term : terms) {
- builder.add(newTermQuery(term), BooleanClause.Occur.SHOULD);
+ for (TermAndBoost term : terms) {
+ builder.add(newTermQuery(term.term, term.boost), BooleanClause.Occur.SHOULD);
}
return builder.build();
case AS_SAME_TERM:
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema12.xml b/solr/core/src/test-files/solr/collection1/conf/schema12.xml
index 1368e6b..d4cb89e 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema12.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema12.xml
@@ -227,6 +227,41 @@
</analyzer>
</fieldType>
+ <fieldType name="text_pick_best_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="pick_best" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.DelimitedBoostTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_as_distinct_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_distinct_terms" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.DelimitedBoostTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="text_as_same_term_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_same_term" autoGeneratePhraseQueries="true">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.DelimitedBoostTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
<fieldType name="nametext" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
@@ -656,6 +691,9 @@
<dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
<dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
<dynamicField name="t_pick_best_*" type="text_pick_best" indexed="true" stored="true"/>
+ <dynamicField name="t_pick_best_boosted_*" type="text_pick_best_boosted" indexed="true" stored="true"/>
+ <dynamicField name="t_as_distinct_boosted_*" type="text_as_distinct_boosted" indexed="true" stored="true"/>
+ <dynamicField name="t_as_same_term_boosted_*" type="text_as_same_term_boosted" indexed="true" stored="true"/>
<dynamicField name="t_as_distinct_*" type="text_as_distinct" indexed="true" stored="true"/>
diff --git a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt
index 68dbf0b..d7feb34 100644
--- a/solr/core/src/test-files/solr/collection1/conf/synonyms.txt
+++ b/solr/core/src/test-files/solr/collection1/conf/synonyms.txt
@@ -37,4 +37,18 @@ crow blackbird, grackle
tabby => tabby, cat, feline, animal
persian => persian, cat, feline, animal
-jeans, denim pants
\ No newline at end of file
+jeans, denim pants
+
+# Boosted Synonyms
+tiger, tigre|0.9
+lynx => lince|0.8, lynx_canadensis|0.9
+
+leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
+lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
+
+panthera pardus, leopard|0.6
+panthera tigris => tiger|0.99
+
+snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
+panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
+panthera blytheae, oldest|0.5 ancient|0.9 panthera
\ No newline at end of file
diff --git a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java
index fc1e735..66e9efe 100644
--- a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java
+++ b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymGraphFilterFactory.java
@@ -300,4 +300,80 @@ public class TestManagedSynonymGraphFilterFactory extends RestTestBase {
assertJDelete(endpoint+"/fröhlich",
"/error/code==404");
}
+
+ /**
+ * Can we add single-term synonyms with a weight?
+ */
+ @Test
+ public void testManagedSynonyms_singleTermWithWeight_shouldHandleSynonym() throws Exception {
+ String endpoint = "/schema/analysis/synonyms/englishgraph";
+
+ assertJQ(endpoint,
+ "/synonymMappings/initArgs/ignoreCase==false",
+ "/synonymMappings/managedMap=={}");
+
+ // does not exist
+ assertJQ(endpoint+"/tiger",
+ "/error/code==404");
+
+ Map<String,List<String>> syns = new HashMap<>();
+
+ // now put a synonym
+ syns.put("tiger", Arrays.asList("tiger|1.0"));
+ assertJPut(endpoint,
+ toJSONString(syns),
+ "/responseHeader/status==0");
+
+ // and check if it exists
+ assertJQ(endpoint,
+ "/synonymMappings/managedMap/tiger==['tiger|1.0']");
+
+ // verify delete works
+ assertJDelete(endpoint+"/tiger",
+ "/responseHeader/status==0");
+
+
+ // was it really deleted?
+ assertJDelete(endpoint+"/tiger",
+ "/error/code==404");
+ }
+
+ /**
+ * Can we add multi-term synonyms with a weight?
+ */
+ @Test
+ public void testManagedSynonyms_multiTermWithWeight_shouldHandleSynonym() throws Exception {
+ String endpoint = "/schema/analysis/synonyms/englishgraph";
+
+ assertJQ(endpoint,
+ "/synonymMappings/initArgs/ignoreCase==false",
+ "/synonymMappings/managedMap=={}");
+
+ // does not exist
+ assertJQ(endpoint+"/tiger",
+ "/error/code==404");
+
+ Map<String,List<String>> syns = new HashMap<>();
+
+ // now put a synonym
+ List<String> tigerSyonyms = Arrays.asList("tiger|1.0", "panthera tigris|0.9", "Shere Kan|0.8");
+ syns.put("tiger", tigerSyonyms);
+ String jsonTigerSynonyms = toJSONString(syns);
+ assertJPut(endpoint,
+ jsonTigerSynonyms,
+ "/responseHeader/status==0");
+
+ // and check if it exists
+ assertJQ(endpoint,
+ "/synonymMappings/managedMap/tiger==[\"Shere Kan|0.8\",\"panthera tigris|0.9\",\"tiger|1.0\"]");
+
+ // verify delete works
+ assertJDelete(endpoint+"/tiger",
+ "/responseHeader/status==0");
+
+
+ // was it really deleted?
+ assertJDelete(endpoint+"/tiger",
+ "/error/code==404");
+ }
}
diff --git a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java
index 5812073..4205aac 100644
--- a/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java
+++ b/solr/core/src/test/org/apache/solr/search/TestSolrQueryParser.java
@@ -1220,8 +1220,225 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
assertEquals("(t_as_distinct_foo:\"denim pant\" t_as_distinct_foo:jean)", q.toString());
q = QParser.getParser("jeans", req(params("df", "t_pick_best_foo", "sow", "false"))).getQuery();
- assertEquals("(t_pick_best_foo:\"denim pant\" t_pick_best_foo:jean)", q.toString());
+ assertEquals("(t_pick_best_foo:\"denim pant\" | t_pick_best_foo:jean)", q.toString());
+ }
+
+ public void testSynonymsBoost_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception {
+ //tiger, tigre|0.9
+ Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString());
+
+ q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("(t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger", q.toString());
+
+ q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
+ assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString());
+
+ //lynx => lince|0.8, lynx_canadensis|0.9
+ q = QParser.getParser("lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString());
+
+ q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("(t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9", q.toString());
+
+ q = QParser.getParser("lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
+ assertEquals("Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString());
+ }
+
+ public void testSynonymsBoost_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
+ //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
+ Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString());
+
+ q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString());
+
+ q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)", q.toString());
+
+ //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
+ q = QParser.getParser("lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString());
+
+ q = QParser.getParser("lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString());
+
+ q = QParser.getParser("lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString());
+ }
+
+ public void testSynonymsBoost_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception {
+ //tiger, tigre|0.9
+ //lynx => lince|0.8, lynx_canadensis|0.9
+ Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)" +
+ " ((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString());
+
+ q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)" +
+ " ((t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9)", q.toString());
+
+ q = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
+ assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)" +
+ " Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString());
+ }
+
+ public void testSynonymsBoost_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
+ //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
+ //lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
+ Query q = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)" +
+ " ((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString());
+
+ q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)" +
+ " ((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString());
+
+ q = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)" +
+ " ((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString());
+
+ }
+
+ public void testSynonymsBoost_singleConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception {
+ //panthera pardus, leopard|0.6
+ Query q = QParser.getParser("panthera pardus story",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") t_pick_best_boosted_foo:story", q.toString());
+
+ q = QParser.getParser("panthera pardus story", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") t_as_distinct_boosted_foo:story", q.toString());
+
+ q = QParser.getParser("panthera pardus story", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") t_as_same_term_boosted_foo:story", q.toString());
+
+ //panthera tigris => tiger|0.99
+ q = QParser.getParser("panthera tigris story", req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("(t_pick_best_boosted_foo:tiger)^0.99 t_pick_best_boosted_foo:story", q.toString());
+
+ q = QParser.getParser("panthera tigris story", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("(t_as_distinct_boosted_foo:tiger)^0.99 t_as_distinct_boosted_foo:story", q.toString());
+
+ q = QParser.getParser("panthera tigris story", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("(t_as_same_term_boosted_foo:tiger)^0.99 t_as_same_term_boosted_foo:story", q.toString());
+ }
+
+ public void testSynonymsBoost_singleConceptQueryMultiTermSynonymWithMultipleBoost_shouldParseMultiplicativeBoostedQuery() throws Exception {
+ //panthera blytheae, oldest|0.5 ancient|0.9 panthera
+ Query q = QParser.getParser("panthera blytheae",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"oldest ancient panthera\")^0.45 | t_pick_best_boosted_foo:\"panthera blytheae\")", q.toString());
+
+ q = QParser.getParser("panthera blytheae", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_distinct_boosted_foo:\"panthera blytheae\")", q.toString());
+
+ q = QParser.getParser("panthera blytheae", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_same_term_boosted_foo:\"panthera blytheae\")", q.toString());
+ }
+
+ public void testSynonymsBoost_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
+ //snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
+ Query q = QParser.getParser("snow leopard",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")", q.toString());
+
+ q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")", q.toString());
+
+ q = QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")", q.toString());
+
+ //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
+ q = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString());
+
+ q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString());
+
+ q = QParser.getParser("panthera onca", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString());
+
+ }
+
+ public void testSynonymsBoost_multiConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception {
+ //panthera pardus, leopard|0.6
+ //tiger, tigre|0.9
+ Query q = QParser.getParser("panthera pardus tiger",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") ((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString());
+
+ q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") ((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)", q.toString());
+
+ q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString());
+ }
+
+ public void testSynonymsBoost_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
+ //snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
+ //panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
+ Query q = QParser.getParser("snow leopard panthera onca",req(params("df", "t_pick_best_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")" +
+ " ((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString());
+
+ q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")" +
+ " ((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString());
+
+ q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boosted_foo","sow", "false"))).getQuery();
+ assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")" +
+ " ((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString());
+
+ }
+
+ public void testSynonymsBoost_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception {
+ Query q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_pick_best_boosted_foo^10"))).getQuery();
+ assertEquals("+(" +
+ "((((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\"))^10.0)" +
+ " ((((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75))^10.0)" +
+ ")", q.toString());
+
+ q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_distinct_boosted_foo^10"))).getQuery();
+ assertEquals("+(" +
+ "(((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")^10.0)" +
+ " (((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)^10.0))", q.toString());
+
+ q = QParser.getParser("snow leopard lion","edismax",true, req(params("sow", "false","qf", "t_as_same_term_boosted_foo^10"))).getQuery();
+ assertEquals("+(" +
+ "(((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")^10.0)" +
+ " (((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)^10.0))", q.toString());
+
+ }
+
+ public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost_shouldParseBoostedSpanQuery() throws Exception {
+ Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
+ assertEquals("spanNear([" +
+ "spanOr([" +
+ "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:uncia], 0, true))^0.9," +
+ " (spanNear([t_pick_best_boosted_foo:big, t_pick_best_boosted_foo:cat], 0, true))^0.8," +
+ " (t_pick_best_boosted_foo:white_leopard)^0.6," +
+ " spanNear([t_pick_best_boosted_foo:snow, t_pick_best_boosted_foo:leopard], 0, true)])," +
+ " spanOr([" +
+ "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
+ " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
+ " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString());
+ }
+
+ public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost_shouldParseMultiplicativeBoostedSpanQuery() throws Exception {
+ Query q = QParser.getParser("\"panthera blytheae lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
+ assertEquals("spanNear([" +
+ "spanOr([" +
+ "(spanNear([t_pick_best_boosted_foo:oldest, t_pick_best_boosted_foo:ancient, t_pick_best_boosted_foo:panthera], 0, true))^0.45," +
+ " spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:blytheae], 0, true)])," +
+ " spanOr([" +
+ "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
+ " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
+ " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString());
+ }
+
+ public void testSynonymsBoost_BoostMissing_shouldAssignDefaultBoost() throws Exception {
+ //leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
+ Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
+ assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString());
+ q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
+ assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString());
}
@Test
diff --git a/solr/solr-ref-guide/src/filter-descriptions.adoc b/solr/solr-ref-guide/src/filter-descriptions.adoc
index 86b7aa4..8d73d9f0 100644
--- a/solr/solr-ref-guide/src/filter-descriptions.adoc
+++ b/solr/solr-ref-guide/src/filter-descriptions.adoc
@@ -249,6 +249,72 @@ Discard original token (`inject="false"`).
Note that "Kuczewski" has two encodings, which are added at the same position.
+== Delimited Boost Filter
+
+This filter adds a numeric floating point boost value to tokens, splitting on a delimiter character.
+
+*Factory class:* `solr.DelimitedBoostTokenFilterFactory`
+
+*Arguments:*
+
+`delimiter`:: The character used to separate the token and the boost. Defaults to '|'.
+
+*Example:*
+
+[.dynamic-tabs]
+--
+[example.tab-pane#byname-filter-delimitedBoost]
+====
+[.tab-label]*With name*
+[source,xml]
+----
+<analyzer>
+<tokenizer name="standard"/>
+<filter name="delimitedBoost"/>
+</analyzer>
+----
+====
+[example.tab-pane#byclass-filter-delimitedBoost]
+====
+[.tab-label]*With class name (legacy)*
+[source,xml]
+----
+<analyzer>
+<tokenizer class="solr.StandardTokenizerFactory"/>
+<filter class="solr.DelimitedBoostTokenFilterFactory"/>
+</analyzer>
+----
+====
+--
+
+*In:* "leopard|0.5 panthera uncia|0.9"
+
+*Tokenizer to Filter:* "leopard|0.5"(1), "panthera"(2), "uncia|0.9"(3)
+
+*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9]
+
+The floating point number in square brackets is the token's boost attribute.
+
+*Example:*
+
+Using a different delimiter (`delimiter="/"`).
+
+[source,xml]
+----
+<analyzer>
+<tokenizer name="standard"/>
+<filter name="delimitedBoost" delimiter="/"/>
+</analyzer>
+----
+
+*In:* "leopard/0.5 panthera uncia/0.9"
+
+*Tokenizer to Filter:* "leopard/0.5"(1), "panthera"(2), "uncia/0.9"(3)
+
+*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9]
+
+*N.B.* Make sure the delimiter is compatible with the tokenizer you use.
+
== Edge N-Gram Filter
This filter generates edge n-gram tokens of sizes within the given range.
@@ -1560,6 +1626,39 @@ small => tiny,teeny,weeny
*Out:* "the"(1), "large"(2), "large"(3), "couch"(4), "sofa"(4), "divan"(4)
+*Weighted Synonyms:*
+
+By combining the Delimited Boost Filter with the Synonym Graph Filter, you can achieve weighted synonyms at query time.
+For more information, see:
+https://sease.io/2020/02/introducing-weighted-synonyms-in-apache-lucene.html
+For the following examples, assume a synonyms file named `boostedSynonyms.txt`:
+
+[source,text]
+----
+leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
+lion => panthera leo|0.9, simba|0.8, kimba|0.75
+----
+
+*Example:*
+
+====
+[.tab-label]*With name*
+[source,xml]
+----
+<analyzer type="query">
+ <tokenizer name="standard"/>
+ <filter name="synonymGraph" synonyms="boostedSynonyms.txt"/>
+ <filter name="delimitedBoost"/>
+</analyzer>
+----
+====
+
+*In:* "lion"
+
+*Tokenizer to Filter:* "lion"(1)
+
+*Out:* "panthera"(1), "leo"(2)[0.9], "simba"(1)[0.8], "kimba"(1)[0.75]
+
== Token Offset Payload Filter
This filter adds the numeric character offsets of the token as a payload value for that token.