You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2018/05/28 15:40:01 UTC

[18/35] lucene-solr:jira/solr-11779: SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with content stream. The similarity is calculated between the content stream's value and all fields listed in mlt.fl.

SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with content stream. The similarity is calculated between the content stream's value and all fields listed in mlt.fl.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/41ecad98
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/41ecad98
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/41ecad98

Branch: refs/heads/jira/solr-11779
Commit: 41ecad9897bb8949bfed730cd988aec58aa69775
Parents: 0a1de2c
Author: Dawid Weiss <dw...@apache.org>
Authored: Fri May 25 11:39:42 2018 +0200
Committer: Dawid Weiss <dw...@apache.org>
Committed: Fri May 25 11:39:42 2018 +0200

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   5 +
 .../solr/handler/MoreLikeThisHandler.java       |  42 ++++-
 .../solr/handler/MoreLikeThisHandlerTest.java   | 163 ++++++++++++-------
 3 files changed, 142 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/41ecad98/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9159e92..3218ab3 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -131,6 +131,10 @@ New Features
 Bug Fixes
 ----------------------
 
+* SOLR-5351: Fixed More Like This Handler to use all fields provided in mlt.fl when used with
+  content stream. The similarity is calculated between the content stream's value and all 
+  fields listed in mlt.fl. (Dawid Weiss)
+
 * SOLR-12103: Raise CryptoKeys.DEFAULT_KEYPAIR_LENGTH from 1024 to 2048. (Mark Miller)
 
 * SOLR-12107: Fixed a error in [child] transformer that could ocur if documentCache was not used (hossman)
@@ -425,6 +429,7 @@ Upgrade Notes
 
 New Features
 ----------------------
+
 * SOLR-11285: Simulation framework for autoscaling. (ab)
 
 * LUCENE-2899: In the Solr analysis-extras contrib, added support for the

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/41ecad98/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
index 62f1016..cce2939 100644
--- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java
@@ -21,6 +21,8 @@ import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -36,6 +38,7 @@ import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.StringUtils;
 import org.apache.solr.common.params.CommonParams;
@@ -80,7 +83,13 @@ public class MoreLikeThisHandler extends RequestHandlerBase
   private static final Pattern splitList = Pattern.compile(",| ");
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-  
+
+  static final String ERR_MSG_QUERY_OR_TEXT_REQUIRED =
+      "MoreLikeThis requires either a query (?q=) or text to find similar documents.";
+
+  static final String ERR_MSG_SINGLE_STREAM_ONLY =
+      "MoreLikeThis does not support multiple ContentStreams";
+
   @Override
   public void init(NamedList args) {
     super.init(args);
@@ -156,7 +165,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase
               }
               if (iter.hasNext()) {
                 throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
-                    "MoreLikeThis does not support multiple ContentStreams");
+                    ERR_MSG_SINGLE_STREAM_ONLY);
               }
             }
           }
@@ -191,7 +200,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase
             }
           } else {
             throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
-                "MoreLikeThis requires either a query (?q=) or text to find similar documents.");
+                ERR_MSG_QUERY_OR_TEXT_REQUIRED);
           }
 
         } finally {
@@ -411,10 +420,31 @@ public class MoreLikeThisHandler extends RequestHandlerBase
 
     public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
     {
-      // analyzing with the first field: previous (stupid) behavior
-      rawMLTQuery = mlt.like(mlt.getFieldNames()[0], reader);
+      // SOLR-5351: if we only check against a single field, use the reader directly. Otherwise we
+      // repeat the stream's content for multiple fields so that query terms can be pulled from any
+      // of those fields.
+      String [] fields = mlt.getFieldNames();
+      if (fields.length == 1) {
+        rawMLTQuery = mlt.like(fields[0], reader);
+      } else {
+        CharsRefBuilder buffered = new CharsRefBuilder();
+        char [] chunk = new char [1024];
+        int len;
+        while ((len = reader.read(chunk)) >= 0) {
+          buffered.append(chunk, 0, len);
+        }
+
+        Collection<Object> streamValue = Collections.singleton(buffered.get().toString());
+        Map<String, Collection<Object>> multifieldDoc = new HashMap<>(fields.length);
+        for (String field : fields) {
+          multifieldDoc.put(field, streamValue);
+        }
+
+        rawMLTQuery = mlt.like(multifieldDoc);
+      }
+
       boostedMLTQuery = getBoostedQuery( rawMLTQuery );
-      if( terms != null ) {
+      if (terms != null) {
         fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );
       }
       DocListAndSet results = new DocListAndSet();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/41ecad98/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
index aa63ce3..6b80014 100644
--- a/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
@@ -18,6 +18,7 @@ package org.apache.solr.handler;
 
 import java.util.ArrayList;
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.*;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.ContentStreamBase;
@@ -29,12 +30,10 @@ import org.apache.solr.response.SolrQueryResponse;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-
 /**
  * TODO -- this needs to actually test the results/query etc
  */
 public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
-
   @BeforeClass
   public static void moreLikeThisBeforeClass() throws Exception {
     initCore("solrconfig.xml", "schema.xml");
@@ -47,27 +46,7 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
     MoreLikeThisHandler mlt = new MoreLikeThisHandler();
     
     ModifiableSolrParams params = new ModifiableSolrParams();
-    SolrQueryRequestBase req = new SolrQueryRequestBase( core, params) {};
-    
-    // requires 'q' or single content stream
-    try {
-      mlt.handleRequestBody( req, new SolrQueryResponse() );
-    }
-    catch( Exception ex ) {} // expected
-
-    // requires 'q' or single content stream
-    try {
-      ArrayList<ContentStream> streams = new ArrayList<>( 2 );
-      streams.add( new ContentStreamBase.StringStream( "hello" ) );
-      streams.add( new ContentStreamBase.StringStream( "there" ) );
-      req.setContentStreams( streams );
-      mlt.handleRequestBody( req, new SolrQueryResponse() );
-    }
-    catch( Exception ex ) {} // expected
-    finally {
-      req.close();
-    }
-    
+
     assertU(adoc("id","42","name","Tom Cruise","subword","Top Gun","subword","Risky Business","subword","The Color of Money","subword","Minority Report","subword", "Days of Thunder","subword", "Eyes Wide Shut","subword", "Far and Away", "foo_ti","10"));
     assertU(adoc("id","43","name","Tom Hanks","subword","The Green Mile","subword","Forest Gump","subword","Philadelphia Story","subword","Big","subword","Cast Away", "foo_ti","10"));
     assertU(adoc("id","44","name","Harrison Ford","subword","Star Wars","subword","Indiana Jones","subword","Patriot Games","subword","Regarding Henry"));
@@ -75,7 +54,6 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
     assertU(adoc("id","46","name","Nicole Kidman","subword","Batman","subword","Days of Thunder","subword","Eyes Wide Shut","subword","Far and Away"));
     assertU(commit());
 
-    params.set(CommonParams.Q, "id:42");
     params.set(MoreLikeThisParams.MLT, "true");
     params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword");
     params.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
@@ -83,67 +61,128 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
     params.set(MoreLikeThisParams.MIN_DOC_FREQ,"1");
     params.set("indent","true");
 
-    SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params);
-    assertQ("morelikethis - tom cruise",mltreq
-        ,"//result/doc[1]/str[@name='id'][.='46']"
-        ,"//result/doc[2]/str[@name='id'][.='43']");
+    // requires 'q' or a single content stream
+    SolrException ex = expectThrows(SolrException.class, () -> {
+      try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) {
+        mlt.handleRequestBody(req, new SolrQueryResponse());
+      }
+    });
+    assertEquals(ex.getMessage(), MoreLikeThisHandler.ERR_MSG_QUERY_OR_TEXT_REQUIRED);
+    assertEquals(ex.code(), SolrException.ErrorCode.BAD_REQUEST.code);
+
+    // requires a single content stream (more than one is not supported).
+    ex = expectThrows(SolrException.class, () -> {
+      try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) {
+        ArrayList<ContentStream> streams = new ArrayList<>(2);
+        streams.add(new ContentStreamBase.StringStream("hello"));
+        streams.add(new ContentStreamBase.StringStream("there"));
+        req.setContentStreams(streams);
+        mlt.handleRequestBody(req, new SolrQueryResponse());
+      }
+    });
+    assertEquals(ex.getMessage(), MoreLikeThisHandler.ERR_MSG_SINGLE_STREAM_ONLY);
+    assertEquals(ex.code(), SolrException.ErrorCode.BAD_REQUEST.code);
+
+    params.set(CommonParams.Q, "id:42");
+
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ("morelikethis - tom cruise", mltreq,
+          "//result/doc[1]/str[@name='id'][.='46']",
+          "//result/doc[2]/str[@name='id'][.='43']");
+    }
 
     params.set(MoreLikeThisParams.BOOST, "true");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest( core, params);
-    assertQ("morelikethis - tom cruise",mltreq
-        ,"//result/doc[1]/str[@name='id'][.='46']"
-        ,"//result/doc[2]/str[@name='id'][.='43']");
+
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ("morelikethis - tom cruise", mltreq,
+          "//result/doc[1]/str[@name='id'][.='46']",
+          "//result/doc[2]/str[@name='id'][.='43']");
+    }
     
     params.set(CommonParams.Q, "id:44");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
-    assertQ("morelike this - harrison ford",mltreq
-        ,"//result/doc[1]/str[@name='id'][.='45']");
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ("morelike this - harrison ford", mltreq,
+          "//result/doc[1]/str[@name='id'][.='45']");
+    }
 
     // test MoreLikeThis debug
     params.set(CommonParams.DEBUG_QUERY, "true");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
-    assertQ("morelike this - harrison ford",mltreq
-        ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']"
-        ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']"
-        ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']"
-        ,"//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']"
-        );
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ("morelike this - harrison ford", mltreq,
+          "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='rawMLTQuery']",
+          "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='boostedMLTQuery']",
+          "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/str[@name='realMLTQuery']",
+          "//lst[@name='debug']/lst[@name='moreLikeThis']/lst[@name='44']/lst[@name='explain']/str[@name='45']"
+      );
+    }
 
     // test that qparser plugins work
     params.remove(CommonParams.DEBUG_QUERY);
     params.set(CommonParams.Q, "{!field f=id}44");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
-    assertQ(mltreq
-        ,"//result/doc[1]/str[@name='id'][.='45']");
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ(mltreq, "//result/doc[1]/str[@name='id'][.='45']");
+    }
 
     params.set(CommonParams.Q, "id:42");
     params.set(MoreLikeThisParams.QF,"name^5.0 subword^0.1");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
-    assertQ("morelikethis with weights",mltreq
-        ,"//result/doc[1]/str[@name='id'][.='43']"
-        ,"//result/doc[2]/str[@name='id'][.='46']");
-
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ("morelikethis with weights", mltreq,
+          "//result/doc[1]/str[@name='id'][.='43']",
+          "//result/doc[2]/str[@name='id'][.='46']");
+    }
 
     // test that qparser plugins work w/ the MoreLikeThisHandler
     params.set(CommonParams.QT, "/mlt");
     params.set(CommonParams.Q, "{!field f=id}44");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
-    assertQ(mltreq
-        ,"//result/doc[1]/str[@name='id'][.='45']");
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ(mltreq, "//result/doc[1]/str[@name='id'][.='45']");
+    }
 
     // test that debugging works (test for MoreLikeThis*Handler*)
     params.set(CommonParams.QT, "/mlt");
     params.set(CommonParams.DEBUG_QUERY, "true");
-    mltreq.close(); mltreq = new LocalSolrQueryRequest(h.getCore(), params);
-    assertQ(mltreq
-        ,"//result/doc[1]/str[@name='id'][.='45']"
-        ,"//lst[@name='debug']/lst[@name='explain']"
-    );
+    try (SolrQueryRequest mltreq = new LocalSolrQueryRequest( core, params)) {
+      assertQ(mltreq,
+          "//result/doc[1]/str[@name='id'][.='45']",
+          "//lst[@name='debug']/lst[@name='explain']"
+      );
+    }
+  }
+
+  @Test
+  public void testMultifieldSimilarity() throws Exception
+  {
+    SolrCore core = h.getCore();
+    MoreLikeThisHandler mlt = new MoreLikeThisHandler();
 
-    // params.put(MoreLikeThisParams.QF,new String[]{"foo_ti"});
-    // String response = h.query(mltreq);
-    // System.out.println(response);
+    ModifiableSolrParams params = new ModifiableSolrParams();
 
-    mltreq.close();
+    assertU(adoc("id", "1", "name", "aaa bbb ccc", "subword", "        zzz"));
+    assertU(adoc("id", "2", "name", "    bbb ccc", "subword", "    bbb zzz"));
+    assertU(adoc("id", "3", "name", "        ccc", "subword", "aaa bbb zzz"));
+    assertU(adoc("id", "4", "name", "        ccc", "subword", "    bbb    "));
+    assertU(commit());
+
+    params.set(CommonParams.QT, "/mlt");
+    params.set(MoreLikeThisParams.MLT, "true");
+    params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword");
+    params.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
+    params.set(MoreLikeThisParams.MIN_TERM_FREQ, "1");
+    params.set(MoreLikeThisParams.MIN_DOC_FREQ, "2");
+    params.set(MoreLikeThisParams.BOOST, true);
+    params.set("indent", "true");
+
+    try (SolrQueryRequestBase req = new SolrQueryRequestBase(core, params) {}) {
+      ArrayList<ContentStream> streams = new ArrayList<>(2);
+      streams.add(new ContentStreamBase.StringStream("bbb", "zzz"));
+      req.setContentStreams(streams);
+
+      // Make sure we have terms from both fields in the interestingTerms array and all documents have been
+      // retrieved as matching.
+      assertQ(req,
+          "//lst[@name = 'interestingTerms']/float[@name = 'subword:bbb']",
+          "//lst[@name = 'interestingTerms']/float[@name = 'name:bbb']",
+          "//result[@name = 'response' and @numFound = '4']");
+    }
   }
 }