You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/12/09 16:57:36 UTC

svn commit: r1212488 - in /lucene/dev/trunk/solr/contrib/clustering: ./ src/java/org/apache/solr/handler/clustering/carrot2/ src/test/org/apache/solr/handler/clustering/ src/test/org/apache/solr/handler/clustering/carrot2/

Author: stanislaw
Date: Fri Dec  9 15:57:36 2011
New Revision: 1212488

URL: http://svn.apache.org/viewvc?rev=1212488&view=rev
Log:
SOLR-2937: Configuring the number of contextual snippets used for search results clustering

Modified:
    lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java

Modified: lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt?rev=1212488&r1=1212487&r2=1212488&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt Fri Dec  9 15:57:36 2011
@@ -13,7 +13,10 @@ $Id$
 
 ================== Release 3.6.0 ==================
 
-(No Changes)
+* SOLR-2937: Configuring the number of contextual snippets used for 
+  search results clustering. The hl.snippets parameter is now respected
+  by the clustering plugin and can be overridden by carrot.summarySnippets
+  if needed (Stanislaw Osinski).
 
 ================== Release 3.5.0 ==================
 
@@ -21,10 +24,10 @@ $Id$
 
 ================== Release 3.4.0 ==================
 
-SOLR-2706: The carrot.lexicalResourcesDir parameter now works 
-   with absolute directories (Stanislaw Osinski)
+* SOLR-2706: The carrot.lexicalResourcesDir parameter now works 
+  with absolute directories (Stanislaw Osinski)
   
-SOLR-2692: Typo in param name fixed: "carrot.fragzise" changed to 
+* SOLR-2692: Typo in param name fixed: "carrot.fragzise" changed to 
   "carrot.fragSize" (Stanislaw Osinski).
 
 ================== Release 3.3.0 ==================

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1212488&r1=1212487&r2=1212488&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Fri Dec  9 15:57:36 2011
@@ -331,6 +331,7 @@ public class CarrotClusteringEngine exte
         args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
         args.put(HighlightParams.SIMPLE_POST, "");
         args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
+        args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
         req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
           @Override
           public SolrIndexSearcher getSearcher() {
@@ -364,8 +365,16 @@ public class CarrotClusteringEngine exte
           @SuppressWarnings("unchecked")
           NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
           String [] highlt = tmp.get(snippetField);
-          if (highlt != null && highlt.length == 1) {
-            snippet = highlt[0];
+          
+          // Join fragments with a period so that Carrot2 does not create
+          // cross-fragment phrases; such phrases rarely make sense.
+          if (highlt != null && highlt.length > 0) {
+            final StringBuilder sb = new StringBuilder(highlt[0]);
+            for (int i = 1; i < highlt.length; i++) {
+              sb.append(" . ");
+              sb.append(highlt[i]);
+            }
+            snippet = sb.toString();
           }
         }
       }

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1212488&r1=1212487&r2=1212488&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Fri Dec  9 15:57:36 2011
@@ -34,6 +34,7 @@ public interface CarrotParams {
   String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
   String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
+  String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
 
   String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
 

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java?rev=1212488&r1=1212487&r2=1212488&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java Fri Dec  9 15:57:36 2011
@@ -17,6 +17,7 @@ package org.apache.solr.handler.clusteri
  */
 
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrInputDocument;
 import org.junit.BeforeClass;
 
 
@@ -34,6 +35,17 @@ public abstract class AbstractClustering
       assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
       numberOfDocs++;
     }
+    
+    // Add a multi-valued snippet
+    final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
+    multiValuedSnippet.addField("id", numberOfDocs++);
+    multiValuedSnippet.addField("title", "Title");
+    multiValuedSnippet.addField("url", "URL");
+    multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
+    multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
+    multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
+    assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
+
     assertNull(h.validateUpdate(commit()));
   }
 

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1212488&r1=1212487&r2=1212488&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Fri Dec  9 15:57:36 2011
@@ -88,10 +88,15 @@ public class CarrotClusteringEngineTest 
 
   private List<NamedList<Object>> clusterWithHighlighting(
       boolean enableHighlighting, int fragSize) throws IOException {
-    
-    final TermQuery query = new TermQuery(new Term("snippet", "mine"));
     // Two documents don't have mining in the snippet
-    int expectedNumDocuments = numberOfDocs - 2;
+    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 2);
+  }
+
+  private List<NamedList<Object>> clusterWithHighlighting(
+      boolean enableHighlighting, int fragSize, int summarySnippets,
+      String term, int expectedNumDocuments) throws IOException {
+    
+    final TermQuery query = new TermQuery(new Term("snippet", term));
 
     final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
     summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
@@ -99,6 +104,8 @@ public class CarrotClusteringEngineTest 
         Boolean.toString(enableHighlighting));
     summaryParams
         .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
+    summaryParams
+        .add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));
     final List<NamedList<Object>> summaryClusters = checkEngine(
         getClusteringEngine("echo"), expectedNumDocuments,
         expectedNumDocuments, query, summaryParams);
@@ -229,6 +236,23 @@ public class CarrotClusteringEngineTest 
     assertEquals(ImmutableList.of("solrownstopword"),
         getLabels(clusters.get(1)));
   }
+  
+  @Test
+  public void highlightingOfMultiValueField() throws Exception {
+    final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
+        false, 30, 3, "multi", 1).get(0)).get(1);
+    assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
+    assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
+    assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));
+
+    final String snippetWithSummary = getLabels(clusterWithHighlighting(
+        true, 30, 3, "multi", 1).get(0)).get(1);
+    assertTrue("Snippet with summary shorter than full snippet",
+        snippetWithoutSummary.length() > snippetWithSummary.length());
+    assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
+    assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
+    assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
+  }
 
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()