You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/12/09 17:00:41 UTC

svn commit: r1212490 - in /lucene/dev/trunk/solr/contrib/clustering: ./ src/java/org/apache/solr/handler/clustering/carrot2/ src/test-files/clustering/solr/conf/ src/test/org/apache/solr/handler/clustering/ src/test/org/apache/solr/handler/clustering/c...

Author: stanislaw
Date: Fri Dec  9 16:00:40 2011
New Revision: 1212490

URL: http://svn.apache.org/viewvc?rev=1212490&view=rev
Log:
SOLR-2939: Clustering of multilingual search results

Modified:
    lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
    lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java

Modified: lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt Fri Dec  9 16:00:40 2011
@@ -22,6 +22,10 @@ $Id$
   carrot.snippet can now take comma- or space-separated lists of
   field names to cluster (Stanislaw Osinski).
 
+* SOLR-2939: Clustering of multilingual search results. The document's
+  language field can be passed in the carrot.lang parameter; the carrot.lcmap
+  parameter enables mapping of language codes to ISO 639 (Stanislaw Osinski).
+
 ================== Release 3.5.0 ==================
 
 (No Changes)

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Fri Dec  9 16:00:40 2011
@@ -29,6 +29,7 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.ObjectUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.search.Query;
 import org.apache.solr.common.SolrDocument;
@@ -55,6 +56,7 @@ import org.carrot2.core.Controller;
 import org.carrot2.core.ControllerFactory;
 import org.carrot2.core.Document;
 import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
 import org.carrot2.core.attribute.AttributeNames;
 import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
 import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
@@ -65,6 +67,7 @@ import org.carrot2.util.resource.Resourc
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.base.Function;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
@@ -103,7 +106,7 @@ public class CarrotClusteringEngine exte
    */
   private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
-  
+
   private static class SolrResourceLocator implements IResourceLocator {
     private final SolrResourceLoader resourceLoader;
     private final String carrot2ResourcesDir;
@@ -320,6 +323,22 @@ public class CarrotClusteringEngine exte
     String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
     String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
     String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
+    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
+
+    // Parse language code map string into a map
+    Map<String, String> languageCodeMap = Maps.newHashMap();
+    if (StringUtils.isNotBlank(languageField)) {
+      for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "")
+          .split("[, ]")) {
+        final String[] split = pair.split(":");
+        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
+          languageCodeMap.put(split[0], split[1]);
+        } else {
+          log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP
+              + ": '" + pair + "'. Skipping this mapping.");
+        }
+      }
+    }
     
     // Get the documents
     boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
@@ -392,9 +411,42 @@ public class CarrotClusteringEngine exte
         snippet = getConcatenated(sdoc, snippetFieldSpec);
       }
       
+      // Create a Carrot2 document
       Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
               snippet, (String)sdoc.getFieldValue(urlField));
+      
+      // Store the Solr id of the document; we need it to map document
+      // instances found in clusters back to identifiers.
       carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
+      
+      // Set language
+      if (StringUtils.isNotBlank(languageField)) {
+        Collection<Object> languages = sdoc.getFieldValues(languageField);
+        if (languages != null) {
+          
+          // Use the first Carrot2-supported language
+          for (Object l : languages) {
+            String lang = ObjectUtils.toString(l, "");
+            
+            if (languageCodeMap.containsKey(lang)) {
+              lang = languageCodeMap.get(lang);
+            }
+            
+            // Language detection Library for Java uses dashes to separate
+            // language variants, such as 'zh-cn', but Carrot2 uses underscores.
+            if (lang.indexOf('-') > 0) {
+              lang = lang.replace('-', '_');
+            }
+            
+            // If the language is supported by Carrot2, we'll get a non-null value
+            final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
+            if (carrot2Language != null) {
+              carrotDocument.setLanguage(carrot2Language);
+              break;
+            }
+          }
+        }
+      }
       result.add(carrotDocument);
     }
 

Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Fri Dec  9 16:00:40 2011
@@ -27,18 +27,23 @@ public interface CarrotParams {
   String CARROT_PREFIX = "carrot.";
 
   String ALGORITHM = CARROT_PREFIX + "algorithm";
+  
   String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
   String URL_FIELD_NAME = CARROT_PREFIX + "url";
   String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
+  String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
+  
   String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
-  String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
-  String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
   String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
 
+  String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
+  String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+  String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
 
   public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
-          ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
-          PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
+          ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME, LANGUAGE_FIELD_NAME,
+          PRODUCE_SUMMARY, SUMMARY_FRAGSIZE, SUMMARY_SNIPPETS, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, 
+          LEXICAL_RESOURCES_DIR);
 }

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/conf/schema.xml Fri Dec  9 16:00:40 2011
@@ -280,6 +280,7 @@
 
    <field name="id" type="string" indexed="true" stored="true" required="true" />
    <field name="url" type="string" indexed="true" stored="true" required="true" />
+   <field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true" />
 
    <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
    <field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/AbstractClusteringTestCase.java Fri Dec  9 16:00:40 2011
@@ -56,6 +56,23 @@ public abstract class AbstractClustering
     multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
     assertNull(h.validateUpdate(adoc(multiFieldDoc)));
     
+    // Add a document with one language supported by Carrot2
+    final SolrInputDocument docWithOneSupprtedLanguage = new SolrInputDocument();
+    docWithOneSupprtedLanguage.addField("id", numberOfDocs++);
+    docWithOneSupprtedLanguage.addField("title", "");
+    docWithOneSupprtedLanguage.addField("url", "one_supported_language");
+    docWithOneSupprtedLanguage.addField("lang", "zh-cn");
+    assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguage)));
+    
+    // Add a document with more languages, one supported by Carrot2
+    final SolrInputDocument docWithOneSupprtedLanguageOfMany = new SolrInputDocument();
+    docWithOneSupprtedLanguageOfMany.addField("id", numberOfDocs++);
+    docWithOneSupprtedLanguageOfMany.addField("url", "one_supported_language_of_many");
+    docWithOneSupprtedLanguageOfMany.addField("lang", "zh-tw");
+    docWithOneSupprtedLanguageOfMany.addField("lang", "POLISH");
+    docWithOneSupprtedLanguageOfMany.addField("lang", "de");
+    assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguageOfMany)));
+    
     assertNull(h.validateUpdate(commit()));
   }
 

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Fri Dec  9 16:00:40 2011
@@ -39,6 +39,7 @@ import org.apache.solr.search.DocList;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.RefCounted;
 import org.apache.solr.util.SolrPluginUtils;
+import org.carrot2.core.LanguageCode;
 import org.carrot2.util.attribute.AttributeUtils;
 import org.junit.Test;
 
@@ -89,7 +90,7 @@ public class CarrotClusteringEngineTest 
   private List<NamedList<Object>> clusterWithHighlighting(
       boolean enableHighlighting, int fragSize) throws IOException {
     // Some documents don't have mining in the snippet
-    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 4);
+    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 6);
   }
 
   private List<NamedList<Object>> clusterWithHighlighting(
@@ -295,6 +296,43 @@ public class CarrotClusteringEngineTest 
 
   }
 
+  @Test
+  public void oneCarrot2SupportedLanguage() throws Exception {
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
+
+    final List<String> labels = getLabels(checkEngine(
+        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
+            "one_supported_language")), params).get(0));
+    assertEquals(3, labels.size());
+    assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
+  }
+  
+  @Test
+  public void oneCarrot2SupportedLanguageOfMany() throws Exception {
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
+    
+    final List<String> labels = getLabels(checkEngine(
+        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
+            "one_supported_language_of_many")), params).get(0));
+    assertEquals(3, labels.size());
+    assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
+  }
+  
+  @Test
+  public void languageCodeMapping() throws Exception {
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
+    params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");
+    
+    final List<String> labels = getLabels(checkEngine(
+        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
+            "one_supported_language_of_many")), params).get(0));
+    assertEquals(3, labels.size());
+    assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
+  }
+
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()
             .getSearchComponent("clustering");
@@ -367,7 +405,7 @@ public class CarrotClusteringEngineTest 
     List<Object> docs = getDocs(cluster);
     assertNotNull("docs is null and it shouldn't be", docs);
     for (int j = 0; j < docs.size(); j++) {
-      String id = (String) docs.get(j);
+      Object id = docs.get(j);
       assertNotNull("id is null and it shouldn't be", id);
     }
 

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java?rev=1212490&r1=1212489&r2=1212490&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java Fri Dec  9 16:00:40 2011
@@ -55,6 +55,9 @@ public class EchoClusteringAlgorithm ext
     for (Document document : documents) {
       final Cluster cluster = new Cluster();
       cluster.addPhrases(document.getTitle(), document.getSummary());
+      if (document.getLanguage() != null) {
+        cluster.addPhrases(document.getLanguage().name());
+      }
       cluster.addDocuments(document);
       clusters.add(cluster);
     }