You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2018/11/05 15:33:09 UTC

lucene-solr:master: SOLR-12243: Edismax missing phrase queries when phrases contain multiterm synonyms

Repository: lucene-solr
Updated Branches:
  refs/heads/master 1e3cc4861 -> 01808eee9


SOLR-12243: Edismax missing phrase queries when phrases contain multiterm synonyms


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/01808eee
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/01808eee
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/01808eee

Branch: refs/heads/master
Commit: 01808eee938833b4a1c17a8a92cabdb683732a17
Parents: 1e3cc48
Author: Steve Rowe <sa...@apache.org>
Authored: Mon Nov 5 10:29:39 2018 -0500
Committer: Steve Rowe <sa...@apache.org>
Committed: Mon Nov 5 10:32:49 2018 -0500

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   3 +
 .../solr/search/ExtendedDismaxQParser.java      |  12 +-
 .../collection1/conf/multiword-synonyms.txt     |   4 +-
 .../conf/schema-multiword-synonyms.xml          |   2 +
 .../solr/search/TestMultiWordSynonyms.java      | 165 ++++++++++++++++++-
 .../src/the-extended-dismax-query-parser.adoc   |  15 +-
 6 files changed, 192 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01808eee/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 3cc43f4..198ccf6 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -156,6 +156,9 @@ used with DVHASH method in json.facet. (Tim Underwood via Mikhail Khludnev)
 
 * SOLR-12023: Autoscaling policy engine shuffles replicas needlessly (noble)
 
+* SOLR-12243: Edismax missing phrase queries when phrases contain multiterm synonyms
+  (Elizabeth Haubert, Alessandro Benedetti, Uwe Schindler, Steve Rowe)
+
 Improvements
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01808eee/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
index 3e99d76..d234121 100644
--- a/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
+++ b/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParser.java
@@ -46,6 +46,7 @@ import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.util.Version;
 import org.apache.solr.analysis.TokenizerChain;
 import org.apache.solr.common.SolrException;
@@ -1411,10 +1412,11 @@ public class ExtendedDismaxQParser extends QParser {
             // Boolean query on a whitespace-separated string
             // If these were synonyms we would have a SynonymQuery
             if (query instanceof BooleanQuery) {
-              BooleanQuery bq = (BooleanQuery) query;
-              query = SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch, false);
-            }
-            if (query instanceof PhraseQuery) {
+              if (type == QType.FIELD) { // Don't set mm for boolean query containing phrase queries
+                BooleanQuery bq = (BooleanQuery) query;
+                query = SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch, false);
+              }
+            } else if (query instanceof PhraseQuery) {
               PhraseQuery pq = (PhraseQuery)query;
               if (minClauseSize > 1 && pq.getTerms().length < minClauseSize) return null;
               PhraseQuery.Builder builder = new PhraseQuery.Builder();
@@ -1431,6 +1433,8 @@ public class ExtendedDismaxQParser extends QParser {
               if (slop != mpq.getSlop()) {
                 query = new MultiPhraseQuery.Builder(mpq).setSlop(slop).build();
               }
+            } else if (query instanceof SpanQuery) {
+              return query;
             } else if (minClauseSize > 1) {
               // if it's not a type of phrase query, it doesn't meet the minClauseSize requirements
               return null;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01808eee/solr/core/src/test-files/solr/collection1/conf/multiword-synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/multiword-synonyms.txt b/solr/core/src/test-files/solr/collection1/conf/multiword-synonyms.txt
index 0ef4d78..15a53e0 100644
--- a/solr/core/src/test-files/solr/collection1/conf/multiword-synonyms.txt
+++ b/solr/core/src/test-files/solr/collection1/conf/multiword-synonyms.txt
@@ -10,4 +10,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-US, U.S., U S, USA, U.S.A., U S A, United States, United States of America
\ No newline at end of file
+US, U.S., U S, USA, U.S.A., U S A, United States, United States of America
+bar, tropical cyclone
+chicken, dinner bird
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01808eee/solr/core/src/test-files/solr/collection1/conf/schema-multiword-synonyms.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-multiword-synonyms.xml b/solr/core/src/test-files/solr/collection1/conf/schema-multiword-synonyms.xml
index 0343142..c7ebb36 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema-multiword-synonyms.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-multiword-synonyms.xml
@@ -20,7 +20,9 @@
 
   <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
   <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
+  <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
   <field name="signatureField" type="string" indexed="true" stored="false"/>
+  <field name="boost_d" type="double" indexed="true" stored="false" />
 
   <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
     <analyzer type="index">

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01808eee/solr/core/src/test/org/apache/solr/search/TestMultiWordSynonyms.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/TestMultiWordSynonyms.java b/solr/core/src/test/org/apache/solr/search/TestMultiWordSynonyms.java
index ecc80c3..0ff4a65 100644
--- a/solr/core/src/test/org/apache/solr/search/TestMultiWordSynonyms.java
+++ b/solr/core/src/test/org/apache/solr/search/TestMultiWordSynonyms.java
@@ -19,6 +19,7 @@ package org.apache.solr.search;
 
 import java.util.Arrays;
 
+import org.apache.lucene.search.Query;
 import org.apache.solr.SolrTestCaseJ4;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -80,7 +81,7 @@ public class TestMultiWordSynonyms extends SolrTestCaseJ4 {
       }
     }
   }
-  
+
   @Test
   public void testPhrase() throws Exception {
     for (String q : Arrays.asList
@@ -97,4 +98,166 @@ public class TestMultiWordSynonyms extends SolrTestCaseJ4 {
       }
     }
   }
+
+  @Test
+  public void testPf() throws Exception {
+    // test phrase fields including pf2 pf3 and phrase slop
+    // same as edismax test, but "bar" is a synonym for "tropical cyclone" here
+    assertU(adoc("id", "10", "text", "foo bar a b c", "boost_d", "1.0"));
+    assertU(adoc("id", "11", "text", "foo a bar b c", "boost_d", "2.0"));
+    assertU(adoc("id", "12", "text", "foo a b bar c", "boost_d", "3.0"));
+    assertU(adoc("id", "13", "text", "foo a b c bar", "boost_d", "4.0"));
+    assertU(commit());
+
+    assertQ("default order assumption wrong",
+        req("q", "foo bar",
+            "qf", "text",
+            "bf", "boost_d",
+            "fl", "score,*",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='13']",
+        "//doc[2]/str[@name='id'][.='12']",
+        "//doc[3]/str[@name='id'][.='11']",
+        "//doc[4]/str[@name='id'][.='10']");
+
+    assertQ("default order assumption wrong",
+        req("q", "foo tropical cyclone",
+            "qf", "text",
+            "bf", "boost_d",
+            "fl", "score,*",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='13']",
+        "//doc[2]/str[@name='id'][.='12']",
+        "//doc[3]/str[@name='id'][.='11']",
+        "//doc[4]/str[@name='id'][.='10']");
+
+    assertQ("pf not working",
+        req("q", "foo bar",
+            "qf", "text",
+            "pf", "text^10",
+            "fl", "score,*",
+            "bf", "boost_d",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='10']");
+
+    assertQ("pf not working",
+        req("q", "foo tropical cyclone",
+            "qf", "text",
+            "pf", "text^10",
+            "fl", "score,*",
+            "bf", "boost_d",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='10']");
+
+    assertQ("pf2 not working",
+        req("q", "foo bar",
+            "qf", "text",
+            "pf2", "text^10",
+            "fl", "score,*",
+            "bf", "boost_d",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='10']");
+
+    assertQ("pf3 not working",
+        req("q", "a b bar",
+            "qf", "text",
+            "pf3", "text^10",
+            "fl", "score,*",
+            "bf", "boost_d",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='12']");
+
+    assertQ("pf3 not working",
+        req("q", "a b tropical cyclone",
+            "qf", "text",
+            "pf3", "text^10",
+            "fl", "score,*",
+            "bf", "boost_d",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='12']");
+
+    assertQ("ps not working for pf2",
+        req("q", "bar foo",
+            "qf", "text",
+            "pf2", "text^10",
+            "ps", "2",
+            "fl", "score,*",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='10']");
+
+    assertQ("ps not working for pf2",
+        req("q", "tropical cyclone foo",
+            "qf", "text",
+            "pf2", "text^10",
+            "ps", "2",
+            "fl", "score,*",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='10']");
+  }
+
+  @Test
+  public void testPf3WithReordering() throws Exception {
+    // test pf3 and phrase slop
+    assertU(adoc("id", "20", "text", "chicken 1 2 3 4 5 pig 1 2 3 4 5  anteater bunny cow", "boost_d", "1.0"));
+    assertU(adoc("id", "21", "text", "chicken anteater pig bunny cow", "boost_d", "2.0"));
+    assertU(adoc("id", "22", "text", "chicken 1 2 3 4 5 anteater bunny 1 2 3 4 5 pig cow", "boost_d", "3.0"));
+    assertU(adoc("id", "23", "text", "chicken 1 2 3 4 5 anteater bunny cow 1 2 3 4 5 pig", "boost_d", "4.0"));
+    assertU(commit());
+
+    assertQ("ps not working for pf3",
+        req("q", "anteater chicken pig",
+            "qf", "text",
+            "bf", "boost_d",
+            "pf3", "text^10",
+            "ps", "6",
+            "fl", "score,*",
+            "debugQuery", "true",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='21']");
+  }
+
+ @Test
+  public void testPf3WithoutReordering() throws Exception {
+    // test pf3 and phrase slop
+    assertU(adoc("id", "20", "text", "anteater 1 2 3 4 5 pig 1 2 3 4 5  chicken bunny pig", "boost_d", "1.0"));
+    assertU(adoc("id", "21", "text", "anteater 1 2 chicken 1 2 pig bunny cow", "boost_d", "2.0"));
+    assertU(adoc("id", "22", "text", "chicken 1 2 3 4 5 anteater bunny 1 2 3 4 5 pig cow", "boost_d", "3.0"));
+    assertU(adoc("id", "23", "text", "chicken 1 2 3 4 5 anteater bunny cow 1 2 3 4 5 pig", "boost_d", "4.0"));
+    assertU(commit());
+
+    assertQ("ps not working for pf3",
+        req("q", "anteater chicken pig",
+            "qf", "text",
+            "bf", "boost_d",
+            "pf3", "text^10",
+            "ps", "6",
+            "fl", "score,*",
+            "debugQuery", "true",
+            "defType", "edismax"),
+        "//doc[1]/str[@name='id'][.='21']");
+  }
+
+  public void testEdismaxQueryParsing_multiTermWithPf_shouldParseCorrectPhraseQueries() throws Exception {
+    Query q = QParser.getParser("foo a b bar","edismax",true,
+        req(params("sow", "false","qf", "text^10","pf", "text^10","pf2", "text^5","pf3", "text^8"))).getQuery();
+    assertEquals("+(" +
+        "((text:foo)^10.0) ((text:a)^10.0) ((text:b)^10.0) (((+text:tropical +text:cyclone) text:bar)^10.0)) " +
+        "((spanNear([text:foo, text:a, text:b, spanOr([spanNear([text:tropical, text:cyclone], 0, true), text:bar])], 0, true))^10.0) " +
+        "(((text:\"foo a\")^5.0) ((text:\"a b\")^5.0) ((spanNear([text:b, spanOr([spanNear([text:tropical, text:cyclone], 0, true), text:bar])], 0, true))^5.0)) " +
+        "(((text:\"foo a b\")^8.0) ((spanNear([text:a, text:b, spanOr([spanNear([text:tropical, text:cyclone], 0, true), text:bar])], 0, true))^8.0))", q.toString());
+
+    q = QParser.getParser("tropical cyclone foo a b ","edismax",true, req(params("qf", "text^10","pf", "text^10","pf2", "text^5","pf3", "text^8"))).getQuery();
+    assertEquals("+(" +
+        "((text:bar (+text:tropical +text:cyclone))^10.0) ((text:foo)^10.0) ((text:a)^10.0) ((text:b)^10.0)) " +
+        "((spanNear([spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]), text:foo, text:a, text:b], 0, true))^10.0) " +
+        "(((spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]))^5.0) ((text:\"cyclone foo\")^5.0) ((text:\"foo a\")^5.0) ((text:\"a b\")^5.0)) " +
+        "(((spanNear([spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]), text:foo], 0, true))^8.0) ((text:\"cyclone foo a\")^8.0) ((text:\"foo a b\")^8.0))", q.toString());
+
+    q = QParser.getParser("foo a b tropical cyclone","edismax",true, req(params("qf", "text^10","pf", "text^10","pf2", "text^5","pf3", "text^8"))).getQuery();
+    assertEquals("+(" +
+        "((text:foo)^10.0) ((text:a)^10.0) ((text:b)^10.0) ((text:bar (+text:tropical +text:cyclone))^10.0)) " +
+        "((spanNear([text:foo, text:a, text:b, spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)])], 0, true))^10.0) " +
+        "(((text:\"foo a\")^5.0) ((text:\"a b\")^5.0) ((text:\"b tropical\")^5.0) ((spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]))^5.0)) " +
+        "(((text:\"foo a b\")^8.0) ((text:\"a b tropical\")^8.0) ((spanNear([text:b, spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)])], 0, true))^8.0))", q.toString());
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01808eee/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc b/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc
index e44d31f..08451e0 100644
--- a/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc
+++ b/solr/solr-ref-guide/src/the-extended-dismax-query-parser.adoc
@@ -63,13 +63,13 @@ Phrase Slop. The default amount of slop - distance between terms - on phrase que
 
 `pf2`::
 
-A multivalued list of fields with optional weights. Similar to `pf`, but based on _pairs_ of word shingles.
+A multivalued list of fields with optional weights. Similar to `pf`, but based on word _pair_ shingles.
 
 `ps2`::
 This is similar to `ps` but overrides the slop factor used for `pf2`. If not specified, `ps` is used.
 
 `pf3`::
-A multivalued list of fields with optional weights, based on triplets of word shingles. Similar to `pf`, except that instead of building a phrase per field out of all the words in the input, it builds a set of phrases for each field out of each _triplet_ of word shingles.
+A multivalued list of fields with optional weights, based on word triplet shingles. Similar to `pf`, except that instead of building a phrase per field out of all the words in the input, it builds a set of phrases for each field out of word _triplet_ shingles.
 
 `ps3`::
 This is similar to `ps` but overrides the slop factor used for `pf3`. If not specified, `ps` is used.
@@ -210,4 +210,13 @@ q="Hans Anderson"
 
 A document that contains "Hans Anderson" will match, but a document that contains the middle name "Christian" or where the name is written with the last name first ("Anderson, Hans") won't. For those cases one could configure the query field `qs`, so that even if the user searches for an explicit phrase query, a slop is applied.
 
-Finally, in addition to the phrase fields (`pf`) parameter, `edismax` also supports the `pf2` and `pf3` parameters, for fields over which to create bigram and trigram phrase queries. The phrase slop for these parameters' queries can be specified using the `ps2` and `ps3` parameters, respectively. If you use `pf2`/`pf3` but `ps2`/`ps3`, then the phrase slop for these parameters' queries will be taken from the `ps` parameter, if any.
+Finally, in addition to the phrase fields (`pf`) parameter, `edismax` also supports the `pf2` and `pf3` parameters, for fields over which to create bigram and trigram phrase queries. The phrase slop for these parameters' queries can be specified using the `ps2` and `ps3` parameters, respectively. If you use `pf2`/`pf3` but not `ps2`/`ps3`, then the phrase slop for these parameters' queries will be taken from the `ps` parameter, if any.
+
+=== Synonym expansion in phrase queries with slop
+
+When a phrase query with slop (e.g. `pf` with `ps`) triggers synonym expansion, a separate clause is generated for each combination of synonyms. For example, with configured synonyms `dog,canine` and `cat,feline`, the query `"dog chased cat"` will generate the following phrase query clauses:
+
+* `"dog chased cat"`
+* `"canine chased cat"`
+* `"dog chased feline"`
+* `"canine chased feline"`