You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/11/08 13:02:12 UTC

svn commit: r1407027 - in /lucene/dev/branches/lucene_solr_3_6: ./ lucene/ lucene/contrib/facet/ lucene/contrib/spellchecker/ lucene/core/src/ lucene/core/src/test/org/apache/lucene/analysis/ lucene/site/ lucene/test-framework/src/java/org/apache/lucen...

Author: rmuir
Date: Thu Nov  8 12:02:10 2012
New Revision: 1407027

URL: http://svn.apache.org/viewvc?rev=1407027&view=rev
Log:
SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token

Modified:
    lucene/dev/branches/lucene_solr_3_6/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/contrib/facet/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/contrib/spellchecker/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/core/src/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/core/src/test/org/apache/lucene/analysis/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/site/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java   (props changed)
    lucene/dev/branches/lucene_solr_3_6/lucene/tools/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/common-build.xml   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/contrib/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/core/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java
    lucene/dev/branches/lucene_solr_3_6/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
    lucene/dev/branches/lucene_solr_3_6/solr/example/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/lib/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/solrj/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/test-framework/   (props changed)
    lucene/dev/branches/lucene_solr_3_6/solr/testlogging.properties   (props changed)

Modified: lucene/dev/branches/lucene_solr_3_6/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_3_6/solr/CHANGES.txt?rev=1407027&r1=1407026&r2=1407027&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_3_6/solr/CHANGES.txt (original)
+++ lucene/dev/branches/lucene_solr_3_6/solr/CHANGES.txt Thu Nov  8 12:02:10 2012
@@ -27,6 +27,9 @@ Bug Fixes
 * SOLR-3790: ConcurrentModificationException could be thrown when using hl.fl=*.
   (yonik, koji)
 
+* SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token.
+  (Tom Burton-West, Robert Muir)
+
 ==================  3.6.1  ==================
 More information about this release, including any errata related to the 
 release notes, upgrade instructions, or other changes may be found online at:

Modified: lucene/dev/branches/lucene_solr_3_6/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_3_6/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java?rev=1407027&r1=1407026&r2=1407027&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_3_6/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java (original)
+++ lucene/dev/branches/lucene_solr_3_6/solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java Thu Nov  8 12:02:10 2012
@@ -121,7 +121,10 @@ class ExtendedDismaxQParser extends QPar
     SolrParams params = getParams();
     
     solrParams = SolrParams.wrapDefaults(localParams, params);
-
+    // Solr 4.0 sets the default at 0% if q.op=OR and %100 if q.op =AND
+    // just go with the flow and use 3.6 default of 100% here
+    final String minShouldMatch = solrParams.get(DisMaxParams.MM, "100%");
+     
     userFields = new UserFields(U.parseFieldBoosts(solrParams.getParams(DMP.UF)));
     
     queryFields = SolrPluginUtils.parseFieldBoosts(solrParams.getParams(DisMaxParams.QF));
@@ -235,7 +238,9 @@ class ExtendedDismaxQParser extends QPar
       // For correct lucene queries, turn off mm processing if there
       // were explicit operators (except for AND).
       boolean doMinMatched = (numOR + numNOT + numPluses + numMinuses) == 0;
-
+      // but always for unstructured implicit bqs created by getFieldQuery
+      up.minShouldMatch = minShouldMatch;
+    
       try {
         up.setRemoveStopFilter(!stopwords);
         up.exceptions = true;
@@ -252,7 +257,6 @@ class ExtendedDismaxQParser extends QPar
       }
 
       if (parsedUserQuery != null && doMinMatched) {
-        String minShouldMatch = solrParams.get(DisMaxParams.MM, "100%");
         if (parsedUserQuery instanceof BooleanQuery) {
           SolrPluginUtils.setMinShouldMatch((BooleanQuery)parsedUserQuery, minShouldMatch);
         }
@@ -295,8 +299,7 @@ class ExtendedDismaxQParser extends QPar
         parsedUserQuery = up.parse(escapedUserQuery);
 
         // Only do minimum-match logic
-        String minShouldMatch = solrParams.get(DisMaxParams.MM, "100%");
-
+    
         if (parsedUserQuery instanceof BooleanQuery) {
           BooleanQuery t = new BooleanQuery();
           SolrPluginUtils.flattenBooleanQuery(t, (BooleanQuery)parsedUserQuery);
@@ -874,6 +877,7 @@ class ExtendedDismaxQParser extends QPar
                               // used when constructing boosting part of query via sloppy phrases
     boolean exceptions;  //  allow exceptions to be thrown (for example on a missing field)
 
+    String minShouldMatch; // for inner boolean queries produced from a single fieldQuery
     ExtendedAnalyzer analyzer;
 
     /**
@@ -1117,6 +1121,18 @@ class ExtendedDismaxQParser extends QPar
           case FIELD:  // fallthrough
           case PHRASE:
             Query query = super.getFieldQuery(field, val, type == QType.PHRASE);
+            // A BooleanQuery is only possible from getFieldQuery if it came from
+            // a single whitespace separated term. In this case, check the coordination
+            // factor on the query: if its enabled, that means we aren't a set of synonyms
+            // but instead multiple terms from one whitespace-separated term, we must
+            // apply minShouldMatch here so that it works correctly with other things
+            // like aliasing.
+            if (query instanceof BooleanQuery) {
+              BooleanQuery bq = (BooleanQuery) query;
+              if (!bq.isCoordDisabled()) {
+                SolrPluginUtils.setMinShouldMatch(bq, minShouldMatch);
+              }
+            }
             if (query instanceof PhraseQuery) {
               PhraseQuery pq = (PhraseQuery)query;
               if (minClauseSize > 1 && pq.getTerms().length < minClauseSize) return null;

Modified: lucene/dev/branches/lucene_solr_3_6/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_3_6/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java?rev=1407027&r1=1407026&r2=1407027&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_3_6/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java (original)
+++ lucene/dev/branches/lucene_solr_3_6/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java Thu Nov  8 12:02:10 2012
@@ -55,6 +55,16 @@ public class TestExtendedDismaxParser ex
     assertU(adoc("id", "49", "text_sw", "start the big apple end", "foo_i","-100"));
     assertU(adoc("id", "50", "text_sw", "start new big city end"));
     assertU(adoc("id", "51", "store",   "12.34,-56.78"));
+    assertU(adoc("id", "52", "text_sw", "tekna theou klethomen"));
+    assertU(adoc("id", "53", "text_sw", "nun tekna theou esmen"));
+    assertU(adoc("id", "54", "text_sw", "phanera estin ta tekna tou theou"));
+    assertU(adoc("id", "55", "standardtok", "大"));
+    assertU(adoc("id", "56", "standardtok", "大亚"));
+    assertU(adoc("id", "57", "standardtok", "大亚湾"));
+    assertU(adoc("id", "58", "HTMLstandardtok", "大"));
+    assertU(adoc("id", "59", "HTMLstandardtok", "大亚"));
+    assertU(adoc("id", "60", "HTMLstandardtok", "大亚湾"));
+    assertU(adoc("id", "61", "text_sw", "bazaaa")); // synonyms in an expansion group
     assertU(commit());
   }
   @Override
@@ -501,4 +511,151 @@ public class TestExtendedDismaxParser ex
             "//str[@name='id'][.='145']",
             "//str[@name='id'][.='146']");
   }
+  /**
+   * SOLR-3589: Edismax parser does not honor mm parameter if analyzer splits a token
+   */
+  public void testCJK() throws Exception {
+    assertQ("test cjk (disjunction)",
+        req("q", "大亚湾",
+            "qf", "standardtok",
+            "mm", "0%",
+            "defType", "edismax")
+        , "*[count(//doc)=3]");
+    assertQ("test cjk (minShouldMatch)",
+        req("q", "大亚湾",
+            "qf", "standardtok",
+            "mm", "67%",
+            "defType", "edismax")
+        , "*[count(//doc)=2]");
+    assertQ("test cjk (conjunction)",
+        req("q", "大亚湾",
+            "qf", "standardtok",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=1]");
+  }
+  /** 
+   * test that minShouldMatch works with aliasing
+   * for implicit boolean queries
+   */
+  public void testCJKAliasing() throws Exception {
+    // single field
+    assertQ("test cjk (aliasing+disjunction)",
+        req("q", "myalias:大亚湾",
+            "f.myalias.qf", "standardtok",
+            "mm", "0%",
+            "defType", "edismax")
+        , "*[count(//doc)=3]");
+    assertQ("test cjk (aliasing+minShouldMatch)",
+        req("q", "myalias:大亚湾",
+            "f.myalias.qf", "standardtok",
+            "mm", "67%",
+            "defType", "edismax")
+        , "*[count(//doc)=2]");
+    assertQ("test cjk (aliasing+conjunction)",
+        req("q", "myalias:大亚湾",
+            "f.myalias.qf", "standardtok",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=1]");
+    // multifield
+    assertQ("test cjk (aliasing+disjunction)",
+        req("q", "myalias:大亚湾",
+            "f.myalias.qf", "standardtok HTMLstandardtok",
+            "mm", "0%",
+            "defType", "edismax")
+        , "*[count(//doc)=6]");
+    assertQ("test cjk (aliasing+minShouldMatch)",
+        req("q", "myalias:大亚湾",
+            "f.myalias.qf", "standardtok HTMLstandardtok",
+            "mm", "67%",
+            "defType", "edismax")
+        , "*[count(//doc)=4]");
+    assertQ("test cjk (aliasing+conjunction)",
+        req("q", "myalias:大亚湾",
+            "f.myalias.qf", "standardtok HTMLstandardtok",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=2]");
+  }
+  
+  /** Test that we apply boosts correctly */
+  public void testCJKBoosts() throws Exception {
+    assertQ("test cjk (disjunction)",
+        req("q", "大亚湾",
+            "qf", "standardtok^2 HTMLstandardtok",
+            "mm", "0%",
+            "defType", "edismax")
+        , "*[count(//doc)=6]", "//result/doc[1]/str[@name='id'][.='57']");
+    assertQ("test cjk (minShouldMatch)",
+        req("q", "大亚湾",
+            "qf", "standardtok^2 HTMLstandardtok",
+            "mm", "67%",
+            "defType", "edismax")
+        , "*[count(//doc)=4]", "//result/doc[1]/str[@name='id'][.='57']");
+    assertQ("test cjk (conjunction)",
+        req("q", "大亚湾",
+            "qf", "standardtok^2 HTMLstandardtok",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=2]", "//result/doc[1]/str[@name='id'][.='57']");
+    
+    // now boost the other field
+    assertQ("test cjk (disjunction)",
+        req("q", "大亚湾",
+            "qf", "standardtok HTMLstandardtok^2",
+            "mm", "0%",
+            "defType", "edismax")
+        , "*[count(//doc)=6]", "//result/doc[1]/str[@name='id'][.='60']");
+    assertQ("test cjk (minShouldMatch)",
+        req("q", "大亚湾",
+            "qf", "standardtok HTMLstandardtok^2",
+            "mm", "67%",
+            "defType", "edismax")
+        , "*[count(//doc)=4]", "//result/doc[1]/str[@name='id'][.='60']");
+    assertQ("test cjk (conjunction)",
+        req("q", "大亚湾",
+            "qf", "standardtok HTMLstandardtok^2",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=2]", "//result/doc[1]/str[@name='id'][.='60']");
+  }
+  
+  /** always apply minShouldMatch to the inner booleanqueries
+   *  created from whitespace, as these are never structured lucene queries
+   *  but only come from unstructured text */
+  public void testCJKStructured() throws Exception {
+    assertQ("test cjk (disjunction)",
+        req("q", "大亚湾 OR bogus",
+            "qf", "standardtok",
+            "mm", "0%",
+            "defType", "edismax")
+        , "*[count(//doc)=3]");
+    assertQ("test cjk (minShouldMatch)",
+        req("q", "大亚湾 OR bogus",
+            "qf", "standardtok",
+            "mm", "67%",
+            "defType", "edismax")
+        , "*[count(//doc)=2]");
+    assertQ("test cjk (conjunction)",
+        req("q", "大亚湾 OR bogus",
+            "qf", "standardtok",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=1]");
+  }
+  
+  /**
+   * Test that we don't apply minShouldMatch to the inner boolean queries
+   * when there are synonyms (these are indicated by coordination factor)
+   */
+  public void testSynonyms() throws Exception {
+    // document only contains baraaa, but should still match.
+    assertQ("test synonyms",
+        req("q", "fooaaa",
+            "qf", "text_sw",
+            "mm", "100%",
+            "defType", "edismax")
+        , "*[count(//doc)=1]");
+  }
 }