You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/08/03 11:08:43 UTC

svn commit: r1153399 - in /lucene/dev/trunk/solr/contrib/clustering/src: java/org/apache/solr/handler/clustering/carrot2/ test-files/clustering/solr/conf/ test/org/apache/solr/handler/clustering/carrot2/

Author: stanislaw
Date: Wed Aug  3 09:08:39 2011
New Revision: 1153399

URL: http://svn.apache.org/viewvc?rev=1153399&view=rev
Log:
SOLR-1692: CarrotClusteringEngine produce summary does nothing: improved unit tests

Added:
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
Modified:
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1153399&r1=1153398&r2=1153399&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Wed Aug  3 09:08:39 2011
@@ -264,7 +264,7 @@ public class CarrotClusteringEngine exte
 
     SolrQueryRequest req = null;
     String[] snippetFieldAry = null;
-    if (produceSummary == true) {
+    if (produceSummary) {
       highlighter = HighlightComponent.getHighlighter(core);
       if (highlighter != null){
         Map<String, Object> args = Maps.newHashMap();

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml?rev=1153399&r1=1153398&r2=1153399&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml Wed Aug  3 09:08:39 2011
@@ -397,6 +397,10 @@
       <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
     </lst>
     <lst name="engine">
+      <str name="name">echo</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
       <str name="name">lexical-resource-check</str>
       <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
     </lst>

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1153399&r1=1153398&r2=1153399&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Wed Aug  3 09:08:39 2011
@@ -58,14 +58,52 @@ public class CarrotClusteringEngineTest 
 
   @Test
   public void testProduceSummary() throws Exception {
-    ModifiableSolrParams solrParams = new ModifiableSolrParams();
-    solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
-    solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
+    // We'll make two queries, one with- and another one without summary
+    // and assert that documents are shorter when highlighter is in use.
+    final List<NamedList<Object>> noSummaryClusters = clusterWithHighlighting(false, 80);
+    final List<NamedList<Object>> summaryClusters = clusterWithHighlighting(true, 80);
+
+    assertEquals("Equal number of clusters", noSummaryClusters.size(), summaryClusters.size());
+    for (int i = 0; i < noSummaryClusters.size(); i++) {
+      assertTrue("Summary shorter than original document", 
+          getLabels(noSummaryClusters.get(i)).get(1).length() > 
+          getLabels(summaryClusters.get(i)).get(1).length()); 
+    }
+  }
+  
+  @Test
+  public void testSummaryFragSize() throws Exception {
+    // We'll make two queries, one short summaries and another one with longer
+    // summaries and will check that the results differ.
+    final List<NamedList<Object>> shortSummaryClusters = clusterWithHighlighting(true, 30);
+    final List<NamedList<Object>> longSummaryClusters = clusterWithHighlighting(true, 80);
+    
+    assertEquals("Equal number of clusters", shortSummaryClusters.size(), longSummaryClusters.size());
+    for (int i = 0; i < shortSummaryClusters.size(); i++) {
+      assertTrue("Summary shorter than original document", 
+          getLabels(shortSummaryClusters.get(i)).get(1).length() < 
+      getLabels(longSummaryClusters.get(i)).get(1).length()); 
+    }
+  }
+
+  private List<NamedList<Object>> clusterWithHighlighting(
+      boolean enableHighlighting, int fragSize) throws IOException {
+    
+    final TermQuery query = new TermQuery(new Term("snippet", "mine"));
+    // Two documents don't have mining in the snippet
+    int expectedNumDocuments = numberOfDocs - 2;
+
+    final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
+    summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
+    summaryParams.add(CarrotParams.PRODUCE_SUMMARY,
+        Boolean.toString(enableHighlighting));
+    summaryParams
+        .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
+    final List<NamedList<Object>> summaryClusters = checkEngine(
+        getClusteringEngine("echo"), expectedNumDocuments,
+        expectedNumDocuments, query, summaryParams);
     
-  	// Note: the expected number of clusters may change after upgrading Carrot2
-  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
-    final int expectedNumClusters = 15;
-    checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
+    return summaryClusters;
   }
 
   @Test
@@ -227,7 +265,6 @@ public class CarrotClusteringEngineTest 
       assertEquals("docList size", expectedNumDocs, docList.matches());
 
       ModifiableSolrParams solrParams = new ModifiableSolrParams();
-      solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
       solrParams.add(clusteringParams);
 
       // Perform clustering

Added: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java?rev=1153399&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java (added)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java Wed Aug  3 09:08:39 2011
@@ -0,0 +1,62 @@
+package org.apache.solr.handler.clustering.carrot2;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.List;
+
+import org.carrot2.core.Cluster;
+import org.carrot2.core.Document;
+import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.ProcessingComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.carrot2.util.attribute.Output;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A mock Carrot2 clustering algorithm that outputs input documents as clusters.
+ * Useful only in tests.
+ */
+@Bindable(prefix = "EchoClusteringAlgorithm")
+public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
+        IClusteringAlgorithm {
+  @Input
+  @Processing
+  @Attribute(key = AttributeNames.DOCUMENTS)
+  private List<Document> documents;
+
+  @Output
+  @Processing
+  @Attribute(key = AttributeNames.CLUSTERS)
+  private List<Cluster> clusters;
+
+  @Override
+  public void process() throws ProcessingException {
+    clusters = Lists.newArrayListWithCapacity(documents.size());
+    
+    for (Document document : documents) {
+      final Cluster cluster = new Cluster();
+      cluster.addPhrases(document.getTitle(), document.getSummary());
+      cluster.addDocuments(document);
+      clusters.add(cluster);
+    }
+  }
+}



Re: svn commit: r1153399 - in /lucene/dev/trunk/solr/contrib/clustering/src: java/org/apache/solr/handler/clustering/carrot2/ test-files/clustering/solr/conf/ test/org/apache/solr/handler/clustering/carrot2/

Posted by Stanislaw Osinski <st...@carrotsearch.com>.
>
> Staszek, add an entry to CHANGES as well?


In fact, this issue was fixed in 3.1.0 and it's already mentioned in CHANGES
under that release. I think the issue remained open because of the
developing discussion in it, but reading these again, I think we can keep
highlighting-clustering the way it is: it's simple and it works. I'll
resolve the issue in a sec and change the "Fix For" version to reflect that.

S.

Re: svn commit: r1153399 - in /lucene/dev/trunk/solr/contrib/clustering/src: java/org/apache/solr/handler/clustering/carrot2/ test-files/clustering/solr/conf/ test/org/apache/solr/handler/clustering/carrot2/

Posted by Dawid Weiss <da...@gmail.com>.
Staszek, add an entry to CHANGES as well?
Dawid

On Wed, Aug 3, 2011 at 11:08 AM, <st...@apache.org> wrote:

> Author: stanislaw
> Date: Wed Aug  3 09:08:39 2011
> New Revision: 1153399
>
> URL: http://svn.apache.org/viewvc?rev=1153399&view=rev
> Log:
> SOLR-1692: CarrotClusteringEngine produce summary does nothing: improved
> unit tests
>
> Added:
>
>  lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> Modified:
>
>  lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
>
>  lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
>
>  lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
>
> Modified:
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1153399&r1=1153398&r2=1153399&view=diff
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
> (original)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
> Wed Aug  3 09:08:39 2011
> @@ -264,7 +264,7 @@ public class CarrotClusteringEngine exte
>
>     SolrQueryRequest req = null;
>     String[] snippetFieldAry = null;
> -    if (produceSummary == true) {
> +    if (produceSummary) {
>       highlighter = HighlightComponent.getHighlighter(core);
>       if (highlighter != null){
>         Map<String, Object> args = Maps.newHashMap();
>
> Modified:
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml?rev=1153399&r1=1153398&r2=1153399&view=diff
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
> (original)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
> Wed Aug  3 09:08:39 2011
> @@ -397,6 +397,10 @@
>       <str
> name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
>     </lst>
>     <lst name="engine">
> +      <str name="name">echo</str>
> +      <str
> name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
> +    </lst>
> +    <lst name="engine">
>       <str name="name">lexical-resource-check</str>
>       <str
> name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
>     </lst>
>
> Modified:
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1153399&r1=1153398&r2=1153399&view=diff
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
> (original)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
> Wed Aug  3 09:08:39 2011
> @@ -58,14 +58,52 @@ public class CarrotClusteringEngineTest
>
>   @Test
>   public void testProduceSummary() throws Exception {
> -    ModifiableSolrParams solrParams = new ModifiableSolrParams();
> -    solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
> -    solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we
> validate this?
> +    // We'll make two queries, one with- and another one without summary
> +    // and assert that documents are shorter when highlighter is in use.
> +    final List<NamedList<Object>> noSummaryClusters =
> clusterWithHighlighting(false, 80);
> +    final List<NamedList<Object>> summaryClusters =
> clusterWithHighlighting(true, 80);
> +
> +    assertEquals("Equal number of clusters", noSummaryClusters.size(),
> summaryClusters.size());
> +    for (int i = 0; i < noSummaryClusters.size(); i++) {
> +      assertTrue("Summary shorter than original document",
> +          getLabels(noSummaryClusters.get(i)).get(1).length() >
> +          getLabels(summaryClusters.get(i)).get(1).length());
> +    }
> +  }
> +
> +  @Test
> +  public void testSummaryFragSize() throws Exception {
> +    // We'll make two queries, one short summaries and another one with
> longer
> +    // summaries and will check that the results differ.
> +    final List<NamedList<Object>> shortSummaryClusters =
> clusterWithHighlighting(true, 30);
> +    final List<NamedList<Object>> longSummaryClusters =
> clusterWithHighlighting(true, 80);
> +
> +    assertEquals("Equal number of clusters", shortSummaryClusters.size(),
> longSummaryClusters.size());
> +    for (int i = 0; i < shortSummaryClusters.size(); i++) {
> +      assertTrue("Summary shorter than original document",
> +          getLabels(shortSummaryClusters.get(i)).get(1).length() <
> +      getLabels(longSummaryClusters.get(i)).get(1).length());
> +    }
> +  }
> +
> +  private List<NamedList<Object>> clusterWithHighlighting(
> +      boolean enableHighlighting, int fragSize) throws IOException {
> +
> +    final TermQuery query = new TermQuery(new Term("snippet", "mine"));
> +    // Two documents don't have mining in the snippet
> +    int expectedNumDocuments = numberOfDocs - 2;
> +
> +    final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
> +    summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
> +    summaryParams.add(CarrotParams.PRODUCE_SUMMARY,
> +        Boolean.toString(enableHighlighting));
> +    summaryParams
> +        .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
> +    final List<NamedList<Object>> summaryClusters = checkEngine(
> +        getClusteringEngine("echo"), expectedNumDocuments,
> +        expectedNumDocuments, query, summaryParams);
>
> -       // Note: the expected number of clusters may change after upgrading
> Carrot2
> -       // due to e.g. internal improvements or tuning of Carrot2
> clustering.
> -    final int expectedNumClusters = 15;
> -    checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two
> don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new
> Term("snippet", "mine")), solrParams);
> +    return summaryClusters;
>   }
>
>   @Test
> @@ -227,7 +265,6 @@ public class CarrotClusteringEngineTest
>       assertEquals("docList size", expectedNumDocs, docList.matches());
>
>       ModifiableSolrParams solrParams = new ModifiableSolrParams();
> -      solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
>       solrParams.add(clusteringParams);
>
>       // Perform clustering
>
> Added:
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java?rev=1153399&view=auto
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> (added)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> Wed Aug  3 09:08:39 2011
> @@ -0,0 +1,62 @@
> +package org.apache.solr.handler.clustering.carrot2;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +import java.util.List;
> +
> +import org.carrot2.core.Cluster;
> +import org.carrot2.core.Document;
> +import org.carrot2.core.IClusteringAlgorithm;
> +import org.carrot2.core.ProcessingComponentBase;
> +import org.carrot2.core.ProcessingException;
> +import org.carrot2.core.attribute.AttributeNames;
> +import org.carrot2.core.attribute.Processing;
> +import org.carrot2.util.attribute.Attribute;
> +import org.carrot2.util.attribute.Bindable;
> +import org.carrot2.util.attribute.Input;
> +import org.carrot2.util.attribute.Output;
> +
> +import com.google.common.collect.Lists;
> +
> +/**
> + * A mock Carrot2 clustering algorithm that outputs input documents as
> clusters.
> + * Useful only in tests.
> + */
> +@Bindable(prefix = "EchoClusteringAlgorithm")
> +public class EchoClusteringAlgorithm extends ProcessingComponentBase
> implements
> +        IClusteringAlgorithm {
> +  @Input
> +  @Processing
> +  @Attribute(key = AttributeNames.DOCUMENTS)
> +  private List<Document> documents;
> +
> +  @Output
> +  @Processing
> +  @Attribute(key = AttributeNames.CLUSTERS)
> +  private List<Cluster> clusters;
> +
> +  @Override
> +  public void process() throws ProcessingException {
> +    clusters = Lists.newArrayListWithCapacity(documents.size());
> +
> +    for (Document document : documents) {
> +      final Cluster cluster = new Cluster();
> +      cluster.addPhrases(document.getTitle(), document.getSummary());
> +      cluster.addDocuments(document);
> +      clusters.add(cluster);
> +    }
> +  }
> +}
>
>
>