You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/08/03 11:08:43 UTC
svn commit: r1153399 - in /lucene/dev/trunk/solr/contrib/clustering/src:
java/org/apache/solr/handler/clustering/carrot2/
test-files/clustering/solr/conf/
test/org/apache/solr/handler/clustering/carrot2/
Author: stanislaw
Date: Wed Aug 3 09:08:39 2011
New Revision: 1153399
URL: http://svn.apache.org/viewvc?rev=1153399&view=rev
Log:
SOLR-1692: CarrotClusteringEngine produce summary does nothing: improved unit tests
Added:
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
Modified:
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1153399&r1=1153398&r2=1153399&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Wed Aug 3 09:08:39 2011
@@ -264,7 +264,7 @@ public class CarrotClusteringEngine exte
SolrQueryRequest req = null;
String[] snippetFieldAry = null;
- if (produceSummary == true) {
+ if (produceSummary) {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map<String, Object> args = Maps.newHashMap();
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml?rev=1153399&r1=1153398&r2=1153399&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml Wed Aug 3 09:08:39 2011
@@ -397,6 +397,10 @@
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
<lst name="engine">
+ <str name="name">echo</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
+ </lst>
+ <lst name="engine">
<str name="name">lexical-resource-check</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
</lst>
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1153399&r1=1153398&r2=1153399&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Wed Aug 3 09:08:39 2011
@@ -58,14 +58,52 @@ public class CarrotClusteringEngineTest
@Test
public void testProduceSummary() throws Exception {
- ModifiableSolrParams solrParams = new ModifiableSolrParams();
- solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
- solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
+ // We'll make two queries, one with- and another one without summary
+ // and assert that documents are shorter when highlighter is in use.
+ final List<NamedList<Object>> noSummaryClusters = clusterWithHighlighting(false, 80);
+ final List<NamedList<Object>> summaryClusters = clusterWithHighlighting(true, 80);
+
+ assertEquals("Equal number of clusters", noSummaryClusters.size(), summaryClusters.size());
+ for (int i = 0; i < noSummaryClusters.size(); i++) {
+ assertTrue("Summary shorter than original document",
+ getLabels(noSummaryClusters.get(i)).get(1).length() >
+ getLabels(summaryClusters.get(i)).get(1).length());
+ }
+ }
+
+ @Test
+ public void testSummaryFragSize() throws Exception {
+ // We'll make two queries, one short summaries and another one with longer
+ // summaries and will check that the results differ.
+ final List<NamedList<Object>> shortSummaryClusters = clusterWithHighlighting(true, 30);
+ final List<NamedList<Object>> longSummaryClusters = clusterWithHighlighting(true, 80);
+
+ assertEquals("Equal number of clusters", shortSummaryClusters.size(), longSummaryClusters.size());
+ for (int i = 0; i < shortSummaryClusters.size(); i++) {
+ assertTrue("Summary shorter than original document",
+ getLabels(shortSummaryClusters.get(i)).get(1).length() <
+ getLabels(longSummaryClusters.get(i)).get(1).length());
+ }
+ }
+
+ private List<NamedList<Object>> clusterWithHighlighting(
+ boolean enableHighlighting, int fragSize) throws IOException {
+
+ final TermQuery query = new TermQuery(new Term("snippet", "mine"));
+ // Two documents don't have mining in the snippet
+ int expectedNumDocuments = numberOfDocs - 2;
+
+ final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
+ summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
+ summaryParams.add(CarrotParams.PRODUCE_SUMMARY,
+ Boolean.toString(enableHighlighting));
+ summaryParams
+ .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
+ final List<NamedList<Object>> summaryClusters = checkEngine(
+ getClusteringEngine("echo"), expectedNumDocuments,
+ expectedNumDocuments, query, summaryParams);
- // Note: the expected number of clusters may change after upgrading Carrot2
- // due to e.g. internal improvements or tuning of Carrot2 clustering.
- final int expectedNumClusters = 15;
- checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
+ return summaryClusters;
}
@Test
@@ -227,7 +265,6 @@ public class CarrotClusteringEngineTest
assertEquals("docList size", expectedNumDocs, docList.matches());
ModifiableSolrParams solrParams = new ModifiableSolrParams();
- solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
solrParams.add(clusteringParams);
// Perform clustering
Added: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java?rev=1153399&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java (added)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java Wed Aug 3 09:08:39 2011
@@ -0,0 +1,62 @@
+package org.apache.solr.handler.clustering.carrot2;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.List;
+
+import org.carrot2.core.Cluster;
+import org.carrot2.core.Document;
+import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.ProcessingComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.carrot2.util.attribute.Output;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A mock Carrot2 clustering algorithm that outputs input documents as clusters.
+ * Useful only in tests.
+ */
+@Bindable(prefix = "EchoClusteringAlgorithm")
+public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
+ IClusteringAlgorithm {
+ @Input
+ @Processing
+ @Attribute(key = AttributeNames.DOCUMENTS)
+ private List<Document> documents;
+
+ @Output
+ @Processing
+ @Attribute(key = AttributeNames.CLUSTERS)
+ private List<Cluster> clusters;
+
+ @Override
+ public void process() throws ProcessingException {
+ clusters = Lists.newArrayListWithCapacity(documents.size());
+
+ for (Document document : documents) {
+ final Cluster cluster = new Cluster();
+ cluster.addPhrases(document.getTitle(), document.getSummary());
+ cluster.addDocuments(document);
+ clusters.add(cluster);
+ }
+ }
+}
Re: svn commit: r1153399 - in /lucene/dev/trunk/solr/contrib/clustering/src:
java/org/apache/solr/handler/clustering/carrot2/ test-files/clustering/solr/conf/
test/org/apache/solr/handler/clustering/carrot2/
Posted by Stanislaw Osinski <st...@carrotsearch.com>.
>
> Staszek, add an entry to CHANGES as well?
In fact, this issue was fixed in 3.1.0 and it's already mentioned in CHANGES
under that release. I think the issue remained open because of the
discussion developing in it, but reading these again, I think we can keep
highlighting-clustering the way it is: it's simple and it works. I'll
resolve the issue in a sec and change the "Fix For" version to reflect that.
S.
Re: svn commit: r1153399 - in /lucene/dev/trunk/solr/contrib/clustering/src:
java/org/apache/solr/handler/clustering/carrot2/ test-files/clustering/solr/conf/
test/org/apache/solr/handler/clustering/carrot2/
Posted by Dawid Weiss <da...@gmail.com>.
Staszek, add an entry to CHANGES as well?
Dawid
On Wed, Aug 3, 2011 at 11:08 AM, <st...@apache.org> wrote:
> Author: stanislaw
> Date: Wed Aug 3 09:08:39 2011
> New Revision: 1153399
>
> URL: http://svn.apache.org/viewvc?rev=1153399&view=rev
> Log:
> SOLR-1692: CarrotClusteringEngine produce summary does nothing: improved
> unit tests
>
> Added:
>
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> Modified:
>
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
>
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
>
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
>
> Modified:
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1153399&r1=1153398&r2=1153399&view=diff
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
> (original)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
> Wed Aug 3 09:08:39 2011
> @@ -264,7 +264,7 @@ public class CarrotClusteringEngine exte
>
> SolrQueryRequest req = null;
> String[] snippetFieldAry = null;
> - if (produceSummary == true) {
> + if (produceSummary) {
> highlighter = HighlightComponent.getHighlighter(core);
> if (highlighter != null){
> Map<String, Object> args = Maps.newHashMap();
>
> Modified:
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml?rev=1153399&r1=1153398&r2=1153399&view=diff
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
> (original)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
> Wed Aug 3 09:08:39 2011
> @@ -397,6 +397,10 @@
> <str
> name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
> </lst>
> <lst name="engine">
> + <str name="name">echo</str>
> + <str
> name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
> + </lst>
> + <lst name="engine">
> <str name="name">lexical-resource-check</str>
> <str
> name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
> </lst>
>
> Modified:
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1153399&r1=1153398&r2=1153399&view=diff
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
> (original)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
> Wed Aug 3 09:08:39 2011
> @@ -58,14 +58,52 @@ public class CarrotClusteringEngineTest
>
> @Test
> public void testProduceSummary() throws Exception {
> - ModifiableSolrParams solrParams = new ModifiableSolrParams();
> - solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
> - solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we
> validate this?
> + // We'll make two queries, one with- and another one without summary
> + // and assert that documents are shorter when highlighter is in use.
> + final List<NamedList<Object>> noSummaryClusters =
> clusterWithHighlighting(false, 80);
> + final List<NamedList<Object>> summaryClusters =
> clusterWithHighlighting(true, 80);
> +
> + assertEquals("Equal number of clusters", noSummaryClusters.size(),
> summaryClusters.size());
> + for (int i = 0; i < noSummaryClusters.size(); i++) {
> + assertTrue("Summary shorter than original document",
> + getLabels(noSummaryClusters.get(i)).get(1).length() >
> + getLabels(summaryClusters.get(i)).get(1).length());
> + }
> + }
> +
> + @Test
> + public void testSummaryFragSize() throws Exception {
> + // We'll make two queries, one short summaries and another one with
> longer
> + // summaries and will check that the results differ.
> + final List<NamedList<Object>> shortSummaryClusters =
> clusterWithHighlighting(true, 30);
> + final List<NamedList<Object>> longSummaryClusters =
> clusterWithHighlighting(true, 80);
> +
> + assertEquals("Equal number of clusters", shortSummaryClusters.size(),
> longSummaryClusters.size());
> + for (int i = 0; i < shortSummaryClusters.size(); i++) {
> + assertTrue("Summary shorter than original document",
> + getLabels(shortSummaryClusters.get(i)).get(1).length() <
> + getLabels(longSummaryClusters.get(i)).get(1).length());
> + }
> + }
> +
> + private List<NamedList<Object>> clusterWithHighlighting(
> + boolean enableHighlighting, int fragSize) throws IOException {
> +
> + final TermQuery query = new TermQuery(new Term("snippet", "mine"));
> + // Two documents don't have mining in the snippet
> + int expectedNumDocuments = numberOfDocs - 2;
> +
> + final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
> + summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
> + summaryParams.add(CarrotParams.PRODUCE_SUMMARY,
> + Boolean.toString(enableHighlighting));
> + summaryParams
> + .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
> + final List<NamedList<Object>> summaryClusters = checkEngine(
> + getClusteringEngine("echo"), expectedNumDocuments,
> + expectedNumDocuments, query, summaryParams);
>
> - // Note: the expected number of clusters may change after upgrading
> Carrot2
> - // due to e.g. internal improvements or tuning of Carrot2
> clustering.
> - final int expectedNumClusters = 15;
> - checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two
> don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new
> Term("snippet", "mine")), solrParams);
> + return summaryClusters;
> }
>
> @Test
> @@ -227,7 +265,6 @@ public class CarrotClusteringEngineTest
> assertEquals("docList size", expectedNumDocs, docList.matches());
>
> ModifiableSolrParams solrParams = new ModifiableSolrParams();
> - solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
> solrParams.add(clusteringParams);
>
> // Perform clustering
>
> Added:
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> URL:
> http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java?rev=1153399&view=auto
>
> ==============================================================================
> ---
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> (added)
> +++
> lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
> Wed Aug 3 09:08:39 2011
> @@ -0,0 +1,62 @@
> +package org.apache.solr.handler.clustering.carrot2;
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +import java.util.List;
> +
> +import org.carrot2.core.Cluster;
> +import org.carrot2.core.Document;
> +import org.carrot2.core.IClusteringAlgorithm;
> +import org.carrot2.core.ProcessingComponentBase;
> +import org.carrot2.core.ProcessingException;
> +import org.carrot2.core.attribute.AttributeNames;
> +import org.carrot2.core.attribute.Processing;
> +import org.carrot2.util.attribute.Attribute;
> +import org.carrot2.util.attribute.Bindable;
> +import org.carrot2.util.attribute.Input;
> +import org.carrot2.util.attribute.Output;
> +
> +import com.google.common.collect.Lists;
> +
> +/**
> + * A mock Carrot2 clustering algorithm that outputs input documents as
> clusters.
> + * Useful only in tests.
> + */
> +@Bindable(prefix = "EchoClusteringAlgorithm")
> +public class EchoClusteringAlgorithm extends ProcessingComponentBase
> implements
> + IClusteringAlgorithm {
> + @Input
> + @Processing
> + @Attribute(key = AttributeNames.DOCUMENTS)
> + private List<Document> documents;
> +
> + @Output
> + @Processing
> + @Attribute(key = AttributeNames.CLUSTERS)
> + private List<Cluster> clusters;
> +
> + @Override
> + public void process() throws ProcessingException {
> + clusters = Lists.newArrayListWithCapacity(documents.size());
> +
> + for (Document document : documents) {
> + final Cluster cluster = new Cluster();
> + cluster.addPhrases(document.getTitle(), document.getSummary());
> + cluster.addDocuments(document);
> + clusters.add(cluster);
> + }
> + }
> +}
>
>
>