You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/12/09 17:00:41 UTC
svn commit: r1212490 - in /lucene/dev/trunk/solr/contrib/clustering: ./
src/java/org/apache/solr/handler/clustering/carrot2/
src/test-files/clustering/solr/conf/
src/test/org/apache/solr/handler/clustering/
src/test/org/apache/solr/handler/clustering/carrot2/
Author: stanislaw
Date: Fri Dec 9 16:00:40 2011
New Revision: 1212490
URL: http://svn.apache.org/viewvc?rev=1212490&view=rev
Log:
SOLR-2939: Clustering of multilingual search results
Modified:
lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
Modified: lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt Fri Dec 9 16:00:40 2011
@@ -22,6 +22,10 @@ $Id$
carrot.snippet can now take comma- or space-separated lists of
field names to cluster (Stanislaw Osinski).
+* SOLR-2939: Clustering of multilingual search results. The document's
+ language field can be passed in the carrot.lang parameter; the carrot.lcmap
+ parameter enables mapping of language codes to ISO 639 (Stanislaw Osinski).
+
================== Release 3.5.0 ==================
(No Changes)
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Fri Dec 9 16:00:40 2011
@@ -29,6 +29,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
@@ -55,6 +56,7 @@ import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
@@ -65,6 +67,7 @@ import org.carrot2.util.resource.Resourc
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
@@ -103,7 +106,7 @@ public class CarrotClusteringEngine exte
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
-
+
private static class SolrResourceLocator implements IResourceLocator {
private final SolrResourceLoader resourceLoader;
private final String carrot2ResourcesDir;
@@ -320,6 +323,22 @@ public class CarrotClusteringEngine exte
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
+ String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
+
+ // Parse language code map string into a map
+ Map<String, String> languageCodeMap = Maps.newHashMap();
+ if (StringUtils.isNotBlank(languageField)) {
+ for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "")
+ .split("[, ]")) {
+ final String[] split = pair.split(":");
+ if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
+ languageCodeMap.put(split[0], split[1]);
+ } else {
+ log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP
+ + ": '" + pair + "'. Skipping this mapping.");
+ }
+ }
+ }
// Get the documents
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
@@ -392,9 +411,42 @@ public class CarrotClusteringEngine exte
snippet = getConcatenated(sdoc, snippetFieldSpec);
}
+ // Create a Carrot2 document
Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
snippet, (String)sdoc.getFieldValue(urlField));
+
+ // Store Solr id of the document, we need it to map document instances
+ // found in clusters back to identifiers.
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
+
+ // Set language
+ if (StringUtils.isNotBlank(languageField)) {
+ Collection<Object> languages = sdoc.getFieldValues(languageField);
+ if (languages != null) {
+
+ // Use the first Carrot2-supported language
+ for (Object l : languages) {
+ String lang = ObjectUtils.toString(l, "");
+
+ if (languageCodeMap.containsKey(lang)) {
+ lang = languageCodeMap.get(lang);
+ }
+
+ // Language detection Library for Java uses dashes to separate
+ // language variants, such as 'zh-cn', but Carrot2 uses underscores.
+ if (lang.indexOf('-') > 0) {
+ lang = lang.replace('-', '_');
+ }
+
+ // If the language is supported by Carrot2, we'll get a non-null value
+ final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
+ if (carrot2Language != null) {
+ carrotDocument.setLanguage(carrot2Language);
+ break;
+ }
+ }
+ }
+ }
result.add(carrotDocument);
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Fri Dec 9 16:00:40 2011
@@ -27,18 +27,23 @@ public interface CarrotParams {
String CARROT_PREFIX = "carrot.";
String ALGORITHM = CARROT_PREFIX + "algorithm";
+
String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
String URL_FIELD_NAME = CARROT_PREFIX + "url";
String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
+ String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
+
String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
- String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
- String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
+ String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
+ String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+ String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
- ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
- PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
+ ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME, LANGUAGE_FIELD_NAME,
+ PRODUCE_SUMMARY, SUMMARY_FRAGSIZE, SUMMARY_SNIPPETS, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS,
+ LEXICAL_RESOURCES_DIR);
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml Fri Dec 9 16:00:40 2011
@@ -280,6 +280,7 @@
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="url" type="string" indexed="true" stored="true" required="true" />
+ <field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true" />
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java Fri Dec 9 16:00:40 2011
@@ -56,6 +56,23 @@ public abstract class AbstractClustering
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
+ // Add a document with one language supported by Carrot2
+ final SolrInputDocument docWithOneSupprtedLanguage = new SolrInputDocument();
+ docWithOneSupprtedLanguage.addField("id", numberOfDocs++);
+ docWithOneSupprtedLanguage.addField("title", "");
+ docWithOneSupprtedLanguage.addField("url", "one_supported_language");
+ docWithOneSupprtedLanguage.addField("lang", "zh-cn");
+ assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguage)));
+
+ // Add a document with more languages, one supported by Carrot2
+ final SolrInputDocument docWithOneSupprtedLanguageOfMany = new SolrInputDocument();
+ docWithOneSupprtedLanguageOfMany.addField("id", numberOfDocs++);
+ docWithOneSupprtedLanguageOfMany.addField("url", "one_supported_language_of_many");
+ docWithOneSupprtedLanguageOfMany.addField("lang", "zh-tw");
+ docWithOneSupprtedLanguageOfMany.addField("lang", "POLISH");
+ docWithOneSupprtedLanguageOfMany.addField("lang", "de");
+ assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguageOfMany)));
+
assertNull(h.validateUpdate(commit()));
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Fri Dec 9 16:00:40 2011
@@ -39,6 +39,7 @@ import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
+import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
@@ -89,7 +90,7 @@ public class CarrotClusteringEngineTest
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize) throws IOException {
// Some documents don't have mining in the snippet
- return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 4);
+ return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 6);
}
private List<NamedList<Object>> clusterWithHighlighting(
@@ -295,6 +296,43 @@ public class CarrotClusteringEngineTest
}
+ @Test
+ public void oneCarrot2SupportedLanguage() throws Exception {
+ final ModifiableSolrParams params = new ModifiableSolrParams();
+ params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
+
+ final List<String> labels = getLabels(checkEngine(
+ getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
+ "one_supported_language")), params).get(0));
+ assertEquals(3, labels.size());
+ assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
+ }
+
+ @Test
+ public void oneCarrot2SupportedLanguageOfMany() throws Exception {
+ final ModifiableSolrParams params = new ModifiableSolrParams();
+ params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
+
+ final List<String> labels = getLabels(checkEngine(
+ getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
+ "one_supported_language_of_many")), params).get(0));
+ assertEquals(3, labels.size());
+ assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
+ }
+
+ @Test
+ public void languageCodeMapping() throws Exception {
+ final ModifiableSolrParams params = new ModifiableSolrParams();
+ params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
+ params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");
+
+ final List<String> labels = getLabels(checkEngine(
+ getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
+ "one_supported_language_of_many")), params).get(0));
+ assertEquals(3, labels.size());
+ assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
+ }
+
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");
@@ -367,7 +405,7 @@ public class CarrotClusteringEngineTest
List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
- String id = (String) docs.get(j);
+ Object id = docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java Fri Dec 9 16:00:40 2011
@@ -55,6 +55,9 @@ public class EchoClusteringAlgorithm ext
for (Document document : documents) {
final Cluster cluster = new Cluster();
cluster.addPhrases(document.getTitle(), document.getSummary());
+ if (document.getLanguage() != null) {
+ cluster.addPhrases(document.getLanguage().name());
+ }
cluster.addDocuments(document);
clusters.add(cluster);
}