You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by pk...@apache.org on 2006/04/13 20:57:58 UTC

svn commit: r393889 - in /lucene/nutch/trunk/src/plugin/clustering-carrot2: ./ lib/ src/java/org/apache/nutch/clustering/carrot2/

Author: pkosiorowski
Date: Thu Apr 13 11:57:55 2006
New Revision: 393889

URL: http://svn.apache.org/viewcvs?rev=393889&view=rev
Log:
NUTCH-237 - Carrot2 clustering plugin upgrade. (Dawid Weiss)

Added:
    lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt
Removed:
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/FSA.LICENSE
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/FSA.jar
Modified:
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE
    lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS Thu Apr 13 11:57:55 2006
@@ -7,6 +7,7 @@
 
 Dawid Weiss; Project administrator, various components, core; 2002; Poland
 Stanisław, Osiński; Lingo clustering component, ODP Input; 2003; Poland
+Karol Gołembniak, Irmina Masłowska; HAOG clustering component; 2006; Poznan University of Technology; Poland
 
 Michał, Wróblewski [*]; AHC clustering components; 2003; Poznan University of Technology, Poland
 Paweł, Kowalik [*]; Inductive search engine wrapper; 2003; Poznan University of Technology, Poland

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE Thu Apr 13 11:57:55 2006
@@ -1,7 +1,9 @@
 
 Carrot2 Project
-Copyright (C) Dawid Weiss, Stanislaw Osinski
-Portions (C) Contributors listed in carrot2.CONTRIBUTORS file.
+
+Copyright (C) 2002-2006, Dawid Weiss, Stanisław Osiński.
+Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
+All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Thu Apr 13 11:57:55 2006
@@ -2,7 +2,7 @@
 <plugin
    id="clustering-carrot2"
    name="Online Search Results Clustering using Carrot2's Lingo component"
-   version="0.9.0"
+   version="1.0.2"
    provider-name="carrot2.sourceforge.net">
 
    <runtime>
@@ -13,21 +13,18 @@
       <library name="carrot2-filter-lingo.jar"/>
       <library name="carrot2-local-core.jar"/>
       <library name="carrot2-snowball-stemmers.jar"/>
-      <library name="carrot2-stemmer-lametyzator.jar"/>
       <library name="carrot2-util-common.jar"/>
       <library name="carrot2-util-tokenizer.jar"/>
 
-      <library name="colt-1.0.3.jar"/>
       <library name="commons-collections-3.1-patched.jar"/>
       <library name="commons-pool-1.1.jar"/>
-      <library name="FSA.jar"/>
       <library name="Jama-1.0.1-patched.jar"/>
+      <library name="violinstrings-1.0.2.jar"/>
    </runtime>
 
    <requires>
       <import plugin="nutch-extensionpoints"/>
       <import plugin="lib-log4j"/>
-      <import plugin="lib-nekohtml"/>
    </requires>
 
    <extension id="org.apache.nutch.clustering.carrot2"
@@ -36,6 +33,5 @@
 
       <implementation id="Carrot2-Lingo"
                       class="org.apache.nutch.clustering.carrot2.Clusterer"/>
-
    </extension>
 </plugin>

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt?rev=393889&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt (added)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt Thu Apr 13 11:57:55 2006
@@ -0,0 +1,44 @@
+This plugin extension adds search results clustering capability to Nutch search 
+frontend.
+
+The user interface in Nutch is very limited and you'll most likely need something 
+more application-specific. Look at http://www.carrot2.org or 
+http://carrot.cs.put.poznan.pl for inspiration.
+
+Libraries in this release are precompiled with stemming and stop words for various
+languages present in Carrot2 codebase (imported from the Snowball project). You 
+must define the default language and supported languages in Nutch configuration
+file (nutch-site.xml). If nothing is given in Nutch configuration, English is 
+taken by default.
+
+<!-- Carrot2 Clustering plugin configuration -->
+
+<property>
+  <name>extension.clustering.carrot2.defaultLanguage</name>
+  <value>en</value>
+  <description>Two-letter ISO code of the language. 
+  http://www.ics.uci.edu/pub/ietf/http/related/iso639.txt</description>
+</property>
+
+<property>
+  <name>extension.clustering.carrot2.languages</name>
+  <value>en,nl,da,fi,fr,de,it,no,pl,pt,ru,es,sv</value>
+  <description>All languages to be used by the clustering plugin. 
+  This list includes all currently supported languages (although not all of them
+  will successfully instantiate -- support for Polish requires additional
+  libraries for instance). Adjust to your needs, fewer languages take less
+  memory.
+  
+  If you use the language recognizer plugin, then each hit will come with its
+  own ISO language code. All hits with no explicit language take the default
+  language specified in "extension.clustering.carrot2.defaultLanguage" property.
+  </description>
+</property>
+
+
+If you need a different language or clustering algorithm, you'll need to modify 
+Nutch plugin code a bit (we don't want the plugin to outgrow Nutch, so we 
+include just the essentials here). Ask on carrot@ developers mailing list for 
+help if you need it.
+
+Carrot2 JARs come from codebase in version: 1.0.2

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java Thu Apr 13 11:57:55 2006
@@ -16,46 +16,86 @@
 
 package org.apache.nutch.clustering.carrot2;
 
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.List;
-import java.util.Iterator;
-
+import java.util.*;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
 import org.apache.nutch.clustering.HitsCluster;
 import org.apache.nutch.clustering.OnlineClusterer;
 import org.apache.nutch.searcher.HitDetails;
+
 import com.dawidweiss.carrot.core.local.*;
 import com.dawidweiss.carrot.core.local.clustering.RawCluster;
 import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
-import com.dawidweiss.carrot.util.tokenizer.SnippetTokenizerLocalFilterComponent;
-import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
-
-import com.dawidweiss.carrot.util.tokenizer.languages.dutch.Dutch;
-import com.dawidweiss.carrot.util.tokenizer.languages.english.English;
-import com.dawidweiss.carrot.util.tokenizer.languages.french.French;
-import com.dawidweiss.carrot.util.tokenizer.languages.german.German;
-import com.dawidweiss.carrot.util.tokenizer.languages.italian.Italian;
-import com.dawidweiss.carrot.util.tokenizer.languages.spanish.Spanish;
 import com.dawidweiss.carrot.core.local.linguistic.Language;
+import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages;
+import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
 
 
 /**
- * An plugin providing an implementation of {@link OnlineClusterer} extension
- * using clustering components of the Carrot2 project
- * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>). 
+ * A plugin providing an implementation of {@link OnlineClusterer} 
+ * extension using clustering components of the Carrot2 project
+ * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
+ * 
+ * We hardcode the following Carrot2 process:
+ * <pre><![CDATA[
+ * <local-process id="nutch-lingo">
+ *   <name>The Lingo clustering algorithm (www.carrot2.org).</name>
+ * 
+ *   <input  component-key="input-localnutch" />
+ *   <filter component-key="filter-lingo" />
+ *   <output component-key="output-clustersConsumer" />
+ * </local-process>
+ * ]]></pre>
  *
  * @author Dawid Weiss
  * @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
  */
-public class Clusterer implements OnlineClusterer {
-  private final LocalController controller;
+public class Clusterer implements OnlineClusterer, Configurable {
+  /** Default language property name. */
+  private final static String CONF_PROP_DEFAULT_LANGUAGE =
+    "extension.clustering.carrot2.defaultLanguage";
+
+  /** Recognizable languages property name. */
+  private final static String CONF_PROP_LANGUAGES =
+    "extension.clustering.carrot2.languages";
+
+  /** Internal clustering process ID in Carrot2 LocalController */
+  private final static String PROCESS_ID = "nutch-lingo";
+  
+  public static final Logger logger =
+    LogFormatter.getLogger(Clusterer.class.getName());  
+
+  /** The LocalController instance used for clustering */
+  private LocalController controller;
+
+  /** Nutch configuration. */
+  private Configuration conf;
+
+  /** 
+   * Default language for hits. English by default, but may be changed
+   * via a property in Nutch configuration. 
+   */
+  private String defaultLanguage = "en";
+
+  /** 
+   * A list of recognizable languages.
+   * English only by default, but configurable via Nutch configuration.
+   */
+  private String [] languages = new String [] {defaultLanguage};
 
   /**
    * An empty public constructor for making new instances
    * of the clusterer.
    */
   public Clusterer() {
+    initialize();
+  }
+
+  private synchronized void initialize() {
     controller = new LocalControllerBase();
     addComponentFactories();
     addProcesses();
@@ -63,68 +103,70 @@
 
   /** Adds the required component factories to a local Carrot2 controller. */
   private void addComponentFactories() {
-    // Local nutch input component
+    //  *   <input  component-key="input-localnutch" />
     LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
       public LocalComponent getInstance() {
-        return new LocalNutchInputComponent();
+        return new LocalNutchInputComponent(defaultLanguage);
       }
     };
-    controller.addLocalComponentFactory("input.localnutch", nutchInputFactory);
-    
-    // Cluster consumer output component
-    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        return new ClustersConsumerOutputComponent();
-      }
-    };
-    controller.addLocalComponentFactory("output.cluster-consumer", 
-      clusterConsumerOutputFactory);
-    
-    // Clustering component here.
+    controller.addLocalComponentFactory("input-localnutch", nutchInputFactory);
+
+    // *   <filter component-key="filter-lingo" />
     LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
       public LocalComponent getInstance() {
         HashMap defaults = new HashMap();
-        
-        // These are adjustments settings for the clustering algorithm...
-        // You can play with them, but the values below are our 'best guess'
-        // settings that we acquired experimentally.
+
+        // These are adjustment settings for the clustering algorithm.
+        // If you try the live WebStart demo of Carrot2 you can see how they affect
+        // the final clustering: http://www.carrot2.org/webstart 
         defaults.put("lsi.threshold.clusterAssignment", "0.150");
         defaults.put("lsi.threshold.candidateCluster",  "0.775");
 
-        // TODO: this should be eventually replaced with documents from Nutch
-        // tagged with a language tag. There is no need to again determine
-        // the language of a document.
+        // Initialize a new Lingo clustering component.
+        ArrayList languageList = new ArrayList(languages.length);
+        for (int i = 0; i < languages.length; i++) {
+          final String lcode = languages[i];
+          try {
+            Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
+            if (lang == null) {
+              logger.log(Level.WARNING, "Language not supported in Carrot2: " + lcode);
+            } else {
+              languageList.add(lang);
+              logger.log(Level.FINE, "Language loaded: " + lcode);
+            }
+          } catch (Throwable t) {
+            logger.log(Level.WARNING, "Language could not be loaded: " + lcode, t);
+          }
+        }
         return new LingoLocalFilterComponent(
-          // If you want to include Polish in the list of supported languages,
-          // you have to download a separate Carrot2-component called
-          // carrot2-stemmer-lametyzator.jar, put it in classpath
-          // and add new Polish() below.
-          new Language[]
-          { 
-            new English(), 
-            new Dutch(), 
-            new French(), 
-            new German(),
-            new Italian(), 
-            new Spanish() 
-          }, defaults);
+          (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
       }
     };
-    controller.addLocalComponentFactory("filter.lingo-old", lingoFactory);      
+    controller.addLocalComponentFactory("filter-lingo", lingoFactory);
+
+    // *   <output component-key="output-clustersConsumer" />
+    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
+      public LocalComponent getInstance() {
+        return new ClustersConsumerOutputComponent();
+      }
+    };
+    controller.addLocalComponentFactory("output-clustersConsumer", 
+      clusterConsumerOutputFactory);
   }
 
-  /** Adds a clustering process to the local controller */  
+  /** 
+   * Adds a hardcoded clustering process to the local controller.
+   */  
   private void addProcesses() {
-    LocalProcessBase lingoNMFKM3 
-      = new LocalProcessBase(
-        "input.localnutch",
-        "output.cluster-consumer",
-        new String [] {"filter.lingo-old"},
-        "Example the Lingo clustering algorithm.",
+    LocalProcessBase process = new LocalProcessBase(
+        "input-localnutch",                                   // input
+        "output-clustersConsumer",                            // output
+        new String [] {"filter-lingo"},                       // filters
+        "The Lingo clustering algorithm (www.carrot2.org).",
         "");
 
     try {
-      controller.addProcess("lingo-nmf-km-3", lingoNMFKM3);
+      controller.addProcess(PROCESS_ID, process);
     } catch (Exception e) {
       throw new RuntimeException("Could not assemble clustering process.", e);
     }
@@ -139,15 +181,17 @@
       hitDetails);
     requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
       descriptions);
+
     try {
-      ProcessingResult result = 
-        controller.query("lingo-nmf-km-3", "pseudo-query", requestParams);
+      // The input component takes Nutch's results so we don't need the query argument.
+      final ProcessingResult result = 
+        controller.query(PROCESS_ID, "no-query", requestParams);
 
-      ClustersConsumerOutputComponent.Result output =
+      final ClustersConsumerOutputComponent.Result output =
         (ClustersConsumerOutputComponent.Result) result.getQueryResult();
 
-      List outputClusters = output.clusters;
-      HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
+      final List outputClusters = output.clusters;
+      final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
 
       int j = 0;
       for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
@@ -162,5 +206,33 @@
     } catch (Exception e) {
       throw new RuntimeException("Unidentified problems with the clustering.", e);
     }
+  }
+
+  /**
+   * Implementation of {@link Configurable}
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    
+    // Configure default language and other component settings.
+    if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) {
+      // Change the default language.
+      this.defaultLanguage = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
+    } 
+    if (conf.getStrings(CONF_PROP_LANGUAGES) != null) {
+      this.languages = conf.getStrings(CONF_PROP_LANGUAGES);
+    }
+
+    logger.log(Level.INFO, "Default language: " + defaultLanguage);
+    logger.log(Level.INFO, "Enabled languages: " + Arrays.asList(languages));
+
+    initialize();
+  }
+
+  /**
+   * Implementation of {@link Configurable}
+   */
+  public Configuration getConf() {
+    return conf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java Thu Apr 13 11:57:55 2006
@@ -33,7 +33,6 @@
  * @version $Id: HitsClusterAdapter.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
  */
 public class HitsClusterAdapter implements HitsCluster {
-
   private RawCluster rawCluster;
   private HitDetails [] hits;
 

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java Thu Apr 13 11:57:55 2006
@@ -34,6 +34,7 @@
 import com.dawidweiss.carrot.core.local.ProcessingException;
 import com.dawidweiss.carrot.core.local.RequestContext;
 import com.dawidweiss.carrot.core.local.clustering.*;
+import com.dawidweiss.carrot.util.common.StringUtils;
 
 /**
  * A local input component that ignores the query passed from the
@@ -58,7 +59,19 @@
   /** This component's capabilities */
   private final static Set COMPONENT_CAPABILITIES 
     = new HashSet(Arrays.asList(new Object [] { RawDocumentsProducer.class }));
-    
+
+  /**
+   * Default language code for hits that don't have their own.
+   */
+  private String defaultLanguage;
+
+  /**
+   * Creates an input component with the given default language code.
+   */
+  public LocalNutchInputComponent(String defaultLanguage) {
+    this.defaultLanguage = defaultLanguage;
+  }
+
   /*
    * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
    */
@@ -66,15 +79,17 @@
       // ignore the query; data will be provided from the request context.
   }
 
-  /** A callback hook that starts the processing. */
+  /**
+   * A callback hook that starts the processing.
+   */
   public void startProcessing(RequestContext context) throws ProcessingException {
     // let successor components know that the processing has started.
     super.startProcessing(context);
     
     // get the information about documents from the context.
-    Map params = context.getRequestParameters();
-    HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
-    String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
+    final Map params = context.getRequestParameters();
+    final HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
+    final String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
     
     if (details == null)
       throw new ProcessingException("Details array must not be null.");
@@ -85,11 +100,10 @@
     if (summaries.length != details.length)
       throw new ProcessingException("Summaries and details must be of the same length.");
     
-    RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
-    
     // produce 'documents' for successor components.
+    final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
     for (int i=0;i<summaries.length;i++) {
-      consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i])));
+      consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i]), defaultLanguage));
     }
   }
 
@@ -107,52 +121,14 @@
     return SUCCESSOR_CAPABILITIES;
   }
 
-  // --- The methods below, plus dependency on the Nekohtml parser
-  // are only required because Nutch's summaries are in HTML by default.
-  // I guess it would be possible to get rid of the code below by
-  // adding patches/ methods to Nutch that return plain text summaries.
-  // 
-  // The temporary quick-and-dirty solution below has been provided by Doug, thanks. 
-
-  /**
-   * The text buffer for plain text. 
-   */
-  private StringBuffer textBuffer = new StringBuffer();
-    
-  /**
-   * A parser that will convert html to plain text.
-   */
-  private AbstractSAXParser parser;
-
-  /*
-   * Anonymous initialization of the parser. Since we declared
-   * the current solution to be quick and dirty, it doesn't have
-   * to be in the constructor :)
-   */
-  {
-    try {
-      parser = new AbstractSAXParser(new HTMLConfiguration()){};
-      parser.setContentHandler(new DefaultHandler() {
-          public void characters(char[] chars, int start, int length)
-            throws SAXException {
-            textBuffer.append(chars, start, length);
-          }
-        });
-    } catch (Exception e) {
-      throw new RuntimeException(e.toString(), e);
-    }
-  }
-
   /**
    * Converts a html chunk to plain text.
+   * 
+   * This method is only required because Nutch's summaries are in HTML.
+   * I guess it would be possible to get rid of the code below by
+   * adding patches/ methods to Nutch that return plain text summaries. 
    */
   private final String htmlToText(String html) {
-    textBuffer.setLength(0);
-    try {
-      parser.parse(new InputSource(new StringReader(html)));
-    } catch (Exception e) {                     // shouldn't happen
-      throw new RuntimeException(e.toString(), e);
-    }
-    return textBuffer.toString();
+    return StringUtils.removeMarkup(html);
   }
 }

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java Thu Apr 13 11:57:55 2006
@@ -22,32 +22,47 @@
 import com.dawidweiss.carrot.core.local.clustering.RawDocumentBase;
 
 /**
- * An adapter class that implements {@link RawDocument} for
- * Carrot2.  
+ * An adapter class that implements {@link RawDocument} required for Carrot2.  
  *
  * @author Dawid Weiss
  * @version $Id: NutchDocument.java,v 1.2 2004/08/10 00:18:43 johnnx Exp $
  */
 public class NutchDocument extends RawDocumentBase {
-
+  /**
+   * Integer identifier of this document. We need a subclass of 
+   * {@link java.lang.Object}, so this should do.
+   */
   private final Integer id;
   
   /**
    * Creates a new document with the given id, <code>summary</code> and wrapping
    * a <code>details</code> hit details.
    */
-  public NutchDocument(int id, HitDetails details, String summary) {
+  public NutchDocument(int id, HitDetails details, String summary, String defaultLanguage) {
     super.setProperty(RawDocument.PROPERTY_URL, details.getValue("url"));
     super.setProperty(RawDocument.PROPERTY_SNIPPET, summary);
-    
-    String title = details.getValue("title");
+
+    final String title = details.getValue("title");
     if (title != null && !"".equals(title)) {
       super.setProperty(RawDocument.PROPERTY_TITLE, title);
     }
     
+    String lang = details.getValue("lang");
+    if (lang == null) {
+      // No language specified for this hit. Take the default from the configuration file.
+      lang = defaultLanguage;
+    }
+    // Use this language for the snippet. Truncate longer ISO codes
+    // to only include two-letter language code.
+    if (lang.length() > 2) {
+      lang = lang.substring(0, 2);
+    }
+    lang = lang.toLowerCase();
+    super.setProperty(RawDocument.PROPERTY_LANGUAGE, lang);
+
     this.id = new Integer(id);
   }
-  
+
   /*
    * @see com.dawidweiss.carrot.core.local.clustering.RawDocument#getId()
    */