You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by eh...@apache.org on 2014/08/13 14:56:13 UTC
svn commit: r1617719 - in /lucene/dev/trunk/solr: CHANGES.txt
core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
Author: ehatcher
Date: Wed Aug 13 12:56:13 2014
New Revision: 1617719
URL: http://svn.apache.org/r1617719
Log:
SOLR-6062: Fix undesirable edismax query parser effect (introduced in SOLR-2058) in how phrase queries
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1617719&r1=1617718&r2=1617719&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Aug 13 12:56:13 2014
@@ -262,6 +262,9 @@ Bug Fixes
* SOLR-6347: DELETEREPLICA throws a NPE while removing the last Replica in a Custom sharded collection.
+* SOLR-6062: Fix undesirable edismax query parser effect (introduced in SOLR-2058) in how phrase queries
+ generated from pf, pf2, and pf3 are merged into the main query. (Michael Dodsworth via ehatcher)
+
Optimizations
---------------------
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java?rev=1617719&r1=1617718&r2=1617719&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java Wed Aug 13 12:56:13 2014
@@ -17,6 +17,9 @@
package org.apache.solr.search;
+import com.google.common.base.Function;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Multimaps;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -43,10 +46,12 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.util.SolrPluginUtils;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -57,15 +62,34 @@ import java.util.Set;
* See Wiki page http://wiki.apache.org/solr/ExtendedDisMax
*/
public class ExtendedDismaxQParser extends QParser {
-
-
+
/**
* A field we can't ever find in any schema, so we can safely tell
* DisjunctionMaxQueryParser to use it as our defaultField, and
* map aliases from it to any field in our schema.
*/
private static String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC";
-
+
+ /**
+ * Helper function which returns the specified {@link FieldParams}' {@link FieldParams#getWordGrams()} value.
+ */
+ private static final Function<FieldParams, Integer> WORD_GRAM_EXTRACTOR = new Function<FieldParams, Integer>() {
+ @Override
+ public Integer apply(FieldParams input) {
+ return input.getWordGrams();
+ }
+ };
+
+ /**
+ * Helper function which returns the specified {@link FieldParams}' {@link FieldParams#getSlop()} value.
+ */
+ private static final Function<FieldParams, Integer> PHRASE_SLOP_EXTRACTOR = new Function<FieldParams, Integer>() {
+ @Override
+ public Integer apply(FieldParams input) {
+ return input.getSlop();
+ }
+ };
+
/** shorten the class references for utilities */
private static class U extends SolrPluginUtils {
/* :NOOP */
@@ -212,21 +236,28 @@ public class ExtendedDismaxQParser exten
if (clause.field != null || clause.isPhrase) continue;
// check for keywords "AND,OR,TO"
if (clause.isBareWord()) {
- String s = clause.val.toString();
+ String s = clause.val;
// avoid putting explicit operators in the phrase query
if ("OR".equals(s) || "AND".equals(s) || "NOT".equals(s) || "TO".equals(s)) continue;
}
normalClauses.add(clause);
}
-
- // full phrase and shingles
- for (FieldParams phraseField: allPhraseFields) {
- Map<String,Float> pf = new HashMap<>(1);
- pf.put(phraseField.getField(),phraseField.getBoost());
- addShingledPhraseQueries(query, normalClauses, pf,
- phraseField.getWordGrams(),config.tiebreaker, phraseField.getSlop());
+
+ // create a map of {wordGram, [phraseField]}
+ Multimap<Integer, FieldParams> phraseFieldsByWordGram = Multimaps.index(allPhraseFields, WORD_GRAM_EXTRACTOR);
+
+ // for each {wordGram, [phraseField]} entry, create and add shingled field queries to the main user query
+ for (Map.Entry<Integer, Collection<FieldParams>> phraseFieldsByWordGramEntry : phraseFieldsByWordGram.asMap().entrySet()) {
+
+ // group the fields within this wordGram collection by their associated slop (it's possible that the same
+ // field appears multiple times for the same wordGram count but with different slop values. In this case, we
+ // should take the *sum* of those phrase queries, rather than the max across them).
+ Multimap<Integer, FieldParams> phraseFieldsBySlop = Multimaps.index(phraseFieldsByWordGramEntry.getValue(), PHRASE_SLOP_EXTRACTOR);
+ for (Map.Entry<Integer, Collection<FieldParams>> phraseFieldsBySlopEntry : phraseFieldsBySlop.asMap().entrySet()) {
+ addShingledPhraseQueries(query, normalClauses, phraseFieldsBySlopEntry.getValue(),
+ phraseFieldsByWordGramEntry.getKey(), config.tiebreaker, phraseFieldsBySlopEntry.getKey());
+ }
}
-
}
}
@@ -493,14 +524,13 @@ public class ExtendedDismaxQParser exten
* @param fields Field => boost mappings for the phrase queries
* @param shingleSize how big the phrases should be, 0 means a single phrase
* @param tiebreaker tie breaker value for the DisjunctionMaxQueries
- * @param slop slop value for the constructed phrases
*/
protected void addShingledPhraseQueries(final BooleanQuery mainQuery,
final List<Clause> clauses,
- final Map<String,Float> fields,
+ final Collection<FieldParams> fields,
int shingleSize,
final float tiebreaker,
- final int slop)
+ final int slop)
throws SyntaxError {
if (null == fields || fields.isEmpty() ||
@@ -509,12 +539,12 @@ public class ExtendedDismaxQParser exten
if (0 == shingleSize) shingleSize = clauses.size();
- final int goat = shingleSize-1; // :TODO: better name for var?
+ final int lastClauseIndex = shingleSize-1;
StringBuilder userPhraseQuery = new StringBuilder();
- for (int i=0; i < clauses.size() - goat; i++) {
+ for (int i=0; i < clauses.size() - lastClauseIndex; i++) {
userPhraseQuery.append('"');
- for (int j=0; j <= goat; j++) {
+ for (int j=0; j <= lastClauseIndex; j++) {
userPhraseQuery.append(clauses.get(i + j).val);
userPhraseQuery.append(' ');
}
@@ -524,8 +554,8 @@ public class ExtendedDismaxQParser exten
/* for parsing sloppy phrases using DisjunctionMaxQueries */
ExtendedSolrQueryParser pp = createEdismaxQueryParser(this, IMPOSSIBLE_FIELD_NAME);
-
- pp.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, fields);
+
+ pp.addAlias(IMPOSSIBLE_FIELD_NAME, tiebreaker, getFieldBoosts(fields));
pp.setPhraseSlop(slop);
pp.setRemoveStopFilter(true); // remove stop filter and keep stopwords
@@ -559,8 +589,20 @@ public class ExtendedDismaxQParser exten
mainQuery.add(phrase, BooleanClause.Occur.SHOULD);
}
}
-
-
+
+ /**
+ * @return a {fieldName, fieldBoost} map for the given fields.
+ */
+ private Map<String, Float> getFieldBoosts(Collection<FieldParams> fields) {
+ Map<String, Float> fieldBoostMap = new LinkedHashMap<>(fields.size());
+
+ for (FieldParams field : fields) {
+ fieldBoostMap.put(field.getField(), field.getBoost());
+ }
+
+ return fieldBoostMap;
+ }
+
@Override
public String[] getDefaultHighlightFields() {
return config.queryFields.keySet().toArray(new String[0]);
@@ -1221,7 +1263,7 @@ public class ExtendedDismaxQParser exten
return null;
}
}
-
+
private Analyzer noStopwordFilterAnalyzer(String fieldName) {
FieldType ft = parser.getReq().getSchema().getFieldType(fieldName);
Analyzer qa = ft.getQueryAnalyzer();
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java?rev=1617719&r1=1617718&r2=1617719&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java Wed Aug 13 12:56:13 2014
@@ -32,7 +32,6 @@ import org.apache.solr.common.SolrExcept
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.SolrPluginUtils;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -683,7 +682,7 @@ public class TestExtendedDismaxParser ex
assertU(adoc("id", "s0", "phrase_sw", "foo bar a b c", "boost_d", "1.0"));
assertU(adoc("id", "s1", "phrase_sw", "foo a bar b c", "boost_d", "2.0"));
assertU(adoc("id", "s2", "phrase_sw", "foo a b bar c", "boost_d", "3.0"));
- assertU(adoc("id", "s3", "phrase_sw", "foo a b c bar", "boost_d", "4.0"));
+ assertU(adoc("id", "s3", "phrase_sw", "foo a b c bar", "boost_d", "4.0"));
assertU(commit());
assertQ("default order assumption wrong",
@@ -695,7 +694,7 @@ public class TestExtendedDismaxParser ex
"//doc[1]/str[@name='id'][.='s3']",
"//doc[2]/str[@name='id'][.='s2']",
"//doc[3]/str[@name='id'][.='s1']",
- "//doc[4]/str[@name='id'][.='s0']");
+ "//doc[4]/str[@name='id'][.='s0']");
assertQ("pf not working",
req("q", "foo bar",
@@ -705,37 +704,37 @@ public class TestExtendedDismaxParser ex
"fl", "score,*",
"defType", "edismax"),
"//doc[1]/str[@name='id'][.='s0']");
-
+
assertQ("pf2 not working",
- req("q", "foo bar",
+ req("q", "foo bar",
"qf", "phrase_sw",
"pf2", "phrase_sw^10",
"bf", "boost_d",
"fl", "score,*",
"defType", "edismax"),
- "//doc[1]/str[@name='id'][.='s0']");
+ "//doc[1]/str[@name='id'][.='s0']");
assertQ("pf3 not working",
- req("q", "a b bar",
+ req("q", "a b bar",
"qf", "phrase_sw",
"pf3", "phrase_sw^10",
"bf", "boost_d",
"fl", "score,*",
"defType", "edismax"),
- "//doc[1]/str[@name='id'][.='s2']");
+ "//doc[1]/str[@name='id'][.='s2']");
assertQ("ps not working for pf2",
- req("q", "bar foo",
+ req("q", "bar foo",
"qf", "phrase_sw",
"pf2", "phrase_sw^10",
"ps", "2",
"bf", "boost_d",
"fl", "score,*",
"defType", "edismax"),
- "//doc[1]/str[@name='id'][.='s0']");
+ "//doc[1]/str[@name='id'][.='s0']");
assertQ("ps not working for pf3",
- req("q", "a bar foo",
+ req("q", "a bar foo",
"qf", "phrase_sw",
"pf3", "phrase_sw^10",
"ps", "3",
@@ -743,8 +742,8 @@ public class TestExtendedDismaxParser ex
"fl", "score,*",
"debugQuery", "true",
"defType", "edismax"),
- "//doc[1]/str[@name='id'][.='s1']");
-
+ "//doc[1]/str[@name='id'][.='s1']");
+
assertQ("ps/ps2/ps3 with default slop overrides not working",
req("q", "zzzz xxxx cccc vvvv",
"qf", "phrase_sw",
@@ -809,6 +808,17 @@ public class TestExtendedDismaxParser ex
"//str[@name='parsedquery'][contains(.,'phrase_sw:\"zzzz xxxx\"~2^22.0')]"
);
+ assertQ("phrase field queries spanning multiple fields should be within their own dismax queries",
+ req("q", "aaaa bbbb cccc",
+ "qf", "phrase_sw phrase1_sw",
+ "pf2", "phrase_sw phrase1_sw",
+ "pf3", "phrase_sw phrase1_sw",
+ "defType", "edismax",
+ "debugQuery", "true"),
+ "//str[@name='parsedquery'][contains(.,'(phrase_sw:\"aaaa bbbb\" | phrase1_sw:\"aaaa bbbb\")')]",
+ "//str[@name='parsedquery'][contains(.,'(phrase_sw:\"bbbb cccc\" | phrase1_sw:\"bbbb cccc\")')]",
+ "//str[@name='parsedquery'][contains(.,'(phrase_sw:\"aaaa bbbb cccc\" | phrase1_sw:\"aaaa bbbb cccc\")')]"
+ );
}
/**