You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by pk...@apache.org on 2006/04/13 20:57:58 UTC
svn commit: r393889 - in /lucene/nutch/trunk/src/plugin/clustering-carrot2:
./ lib/ src/java/org/apache/nutch/clustering/carrot2/
Author: pkosiorowski
Date: Thu Apr 13 11:57:55 2006
New Revision: 393889
URL: http://svn.apache.org/viewcvs?rev=393889&view=rev
Log:
NUTCH-237 - Carrot2 clustering plugin upgrade. (Dawid Weiss)
Added:
lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt
Removed:
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/FSA.LICENSE
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/FSA.jar
Modified:
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS
lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE
lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS Thu Apr 13 11:57:55 2006
@@ -7,6 +7,7 @@
Dawid Weiss; Project administrator, various components, core; 2002; Poland
Stanisław, Osiński; Lingo clustering component, ODP Input; 2003; Poland
+Karol Gołembniak, Irmina Masłowska; HAOG clustering component; 2006; Poznan University of Technology; Poland
Michał, Wróblewski [*]; AHC clustering components; 2003; Poznan University of Technology, Poland
Paweł, Kowalik [*]; Inductive search engine wrapper; 2003; Poznan University of Technology, Poland
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2.LICENSE Thu Apr 13 11:57:55 2006
@@ -1,7 +1,9 @@
Carrot2 Project
-Copyright (C) Dawid Weiss, Stanislaw Osinski
-Portions (C) Contributors listed in carrot2.CONTRIBUTORS file.
+
+Copyright (C) 2002-2006, Dawid Weiss, Stanisław Osiński.
+Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
+All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Thu Apr 13 11:57:55 2006
@@ -2,7 +2,7 @@
<plugin
id="clustering-carrot2"
name="Online Search Results Clustering using Carrot2's Lingo component"
- version="0.9.0"
+ version="1.0.2"
provider-name="carrot2.sourceforge.net">
<runtime>
@@ -13,21 +13,18 @@
<library name="carrot2-filter-lingo.jar"/>
<library name="carrot2-local-core.jar"/>
<library name="carrot2-snowball-stemmers.jar"/>
- <library name="carrot2-stemmer-lametyzator.jar"/>
<library name="carrot2-util-common.jar"/>
<library name="carrot2-util-tokenizer.jar"/>
- <library name="colt-1.0.3.jar"/>
<library name="commons-collections-3.1-patched.jar"/>
<library name="commons-pool-1.1.jar"/>
- <library name="FSA.jar"/>
<library name="Jama-1.0.1-patched.jar"/>
+ <library name="violinstrings-1.0.2.jar"/>
</runtime>
<requires>
<import plugin="nutch-extensionpoints"/>
<import plugin="lib-log4j"/>
- <import plugin="lib-nekohtml"/>
</requires>
<extension id="org.apache.nutch.clustering.carrot2"
@@ -36,6 +33,5 @@
<implementation id="Carrot2-Lingo"
class="org.apache.nutch.clustering.carrot2.Clusterer"/>
-
</extension>
</plugin>
Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt?rev=393889&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt (added)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt Thu Apr 13 11:57:55 2006
@@ -0,0 +1,44 @@
+This plugin extension adds search results clustering capability to Nutch search
+frontend.
+
+The user interface in Nutch is very limited and you'll most likely need something
+more application-specific. Look at http://www.carrot2.org or
+http://carrot.cs.put.poznan.pl for inspiration.
+
+Libraries in this release are precompiled with stemming and stop words for various
+languages present in Carrot2 codebase (imported from the Snowball project). You
+must define the default language and supported languages in Nutch configuration
+file (nutch-site.xml). If nothing is given in Nutch configuration, English is
+taken by default.
+
+<!-- Carrot2 Clustering plugin configuration -->
+
+<property>
+ <name>extension.clustering.carrot2.defaultLanguage</name>
+ <value>en</value>
+ <description>Two-letter ISO code of the language.
+ http://www.ics.uci.edu/pub/ietf/http/related/iso639.txt</description>
+</property>
+
+<property>
+ <name>extension.clustering.carrot2.languages</name>
+ <value>en,nl,da,fi,fr,de,it,no,pl,pt,ru,es,sv</value>
+ <description>All languages to be used by the clustering plugin.
+ This list includes all currently supported languages (although not all of them
+ will successfully instantiate -- support for Polish requires additional
+ libraries for instance). Adjust to your needs, fewer languages take less
+ memory.
+
+ If you use the language recognizer plugin, then each hit will come with its
+ own ISO language code. All hits with no explicit language take the default
+ language specified in "extension.clustering.carrot2.defaultLanguage" property.
+ </description>
+</property>
+
+
+If you need a different language/ clustering algorithm, you'll need to modify
+Nutch plugin code a bit (we don't want the plugin to outgrow Nutch, so we
+include just the essentials here). Ask on carrot@ developers mailing list for
+help if you need it.
+
+Carrot2 JARs come from codebase in version: 1.0.2
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java Thu Apr 13 11:57:55 2006
@@ -16,46 +16,86 @@
package org.apache.nutch.clustering.carrot2;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.List;
-import java.util.Iterator;
-
+import java.util.*;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
import org.apache.nutch.clustering.HitsCluster;
import org.apache.nutch.clustering.OnlineClusterer;
import org.apache.nutch.searcher.HitDetails;
+
import com.dawidweiss.carrot.core.local.*;
import com.dawidweiss.carrot.core.local.clustering.RawCluster;
import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
-import com.dawidweiss.carrot.util.tokenizer.SnippetTokenizerLocalFilterComponent;
-import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
-
-import com.dawidweiss.carrot.util.tokenizer.languages.dutch.Dutch;
-import com.dawidweiss.carrot.util.tokenizer.languages.english.English;
-import com.dawidweiss.carrot.util.tokenizer.languages.french.French;
-import com.dawidweiss.carrot.util.tokenizer.languages.german.German;
-import com.dawidweiss.carrot.util.tokenizer.languages.italian.Italian;
-import com.dawidweiss.carrot.util.tokenizer.languages.spanish.Spanish;
import com.dawidweiss.carrot.core.local.linguistic.Language;
+import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages;
+import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
/**
- * An plugin providing an implementation of {@link OnlineClusterer} extension
- * using clustering components of the Carrot2 project
- * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
+ * An plugin providing an implementation of {@link OnlineClusterer}
+ * extension using clustering components of the Carrot2 project
+ * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
+ *
+ * We hardcode the following Carrot2 process:
+ * <pre><![CDATA[
+ * <local-process id="yahoo-lingo">
+ * <name>Yahoo Search API -- Lingo Classic Clusterer</name>
+ *
+ * <input component-key="input-localnutch" />
+ * <filter component-key="filter-lingo" />
+ * <output component-key="output-clustersConsumer" />
+ * </local-process>
+ * ]]></pre>
*
* @author Dawid Weiss
* @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
*/
-public class Clusterer implements OnlineClusterer {
- private final LocalController controller;
+public class Clusterer implements OnlineClusterer, Configurable {
+ /** Default language property name. */
+ private final static String CONF_PROP_DEFAULT_LANGUAGE =
+ "extension.clustering.carrot2.defaultLanguage";
+
+ /** Recognizable languages property name. */
+ private final static String CONF_PROP_LANGUAGES =
+ "extension.clustering.carrot2.languages";
+
+ /** Internal clustering process ID in Carrot2 LocalController */
+ private final static String PROCESS_ID = "nutch-lingo";
+
+ public static final Logger logger =
+ LogFormatter.getLogger(Clusterer.class.getName());
+
+ /** The LocalController instance used for clustering */
+ private LocalController controller;
+
+ /** Nutch configuration. */
+ private Configuration conf;
+
+ /**
+ * Default language for hits. English by default, but may be changed
+ * via a property in Nutch configuration.
+ */
+ private String defaultLanguage = "en";
+
+ /**
+ * A list of recognizable languages..
+ * English only by default, but configurable via Nutch configuration.
+ */
+ private String [] languages = new String [] {defaultLanguage};
/**
* An empty public constructor for making new instances
* of the clusterer.
*/
public Clusterer() {
+ initialize();
+ }
+
+ private synchronized void initialize() {
controller = new LocalControllerBase();
addComponentFactories();
addProcesses();
@@ -63,68 +103,70 @@
/** Adds the required component factories to a local Carrot2 controller. */
private void addComponentFactories() {
- // Local nutch input component
+ // * <input component-key="input-localnutch" />
LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
public LocalComponent getInstance() {
- return new LocalNutchInputComponent();
+ return new LocalNutchInputComponent(defaultLanguage);
}
};
- controller.addLocalComponentFactory("input.localnutch", nutchInputFactory);
-
- // Cluster consumer output component
- LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
- public LocalComponent getInstance() {
- return new ClustersConsumerOutputComponent();
- }
- };
- controller.addLocalComponentFactory("output.cluster-consumer",
- clusterConsumerOutputFactory);
-
- // Clustering component here.
+ controller.addLocalComponentFactory("input-localnutch", nutchInputFactory);
+
+ // * <filter component-key="filter-lingo" />
LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
public LocalComponent getInstance() {
HashMap defaults = new HashMap();
-
- // These are adjustments settings for the clustering algorithm...
- // You can play with them, but the values below are our 'best guess'
- // settings that we acquired experimentally.
+
+ // These are adjustments settings for the clustering algorithm.
+ // If you try the live WebStart demo of Carrot2 you can see how they affect
+ // the final clustering: http://www.carrot2.org/webstart
defaults.put("lsi.threshold.clusterAssignment", "0.150");
defaults.put("lsi.threshold.candidateCluster", "0.775");
- // TODO: this should be eventually replaced with documents from Nutch
- // tagged with a language tag. There is no need to again determine
- // the language of a document.
+ // Initialize a new Lingo clustering component.
+ ArrayList languageList = new ArrayList(languages.length);
+ for (int i = 0; i < languages.length; i++) {
+ final String lcode = languages[i];
+ try {
+ Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
+ if (lang == null) {
+ logger.log(Level.WARNING, "Language not supported in Carrot2: " + lcode);
+ } else {
+ languageList.add(lang);
+ logger.log(Level.FINE, "Language loaded: " + lcode);
+ }
+ } catch (Throwable t) {
+ logger.log(Level.WARNING, "Language could not be loaded: " + lcode, t);
+ }
+ }
return new LingoLocalFilterComponent(
- // If you want to include Polish in the list of supported languages,
- // you have to download a separate Carrot2-component called
- // carrot2-stemmer-lametyzator.jar, put it in classpath
- // and add new Polish() below.
- new Language[]
- {
- new English(),
- new Dutch(),
- new French(),
- new German(),
- new Italian(),
- new Spanish()
- }, defaults);
+ (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
}
};
- controller.addLocalComponentFactory("filter.lingo-old", lingoFactory);
+ controller.addLocalComponentFactory("filter-lingo", lingoFactory);
+
+ // * <output component-key="output-clustersConsumer" />
+ LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
+ public LocalComponent getInstance() {
+ return new ClustersConsumerOutputComponent();
+ }
+ };
+ controller.addLocalComponentFactory("output-clustersConsumer",
+ clusterConsumerOutputFactory);
}
- /** Adds a clustering process to the local controller */
+ /**
+ * Adds a hardcoded clustering process to the local controller.
+ */
private void addProcesses() {
- LocalProcessBase lingoNMFKM3
- = new LocalProcessBase(
- "input.localnutch",
- "output.cluster-consumer",
- new String [] {"filter.lingo-old"},
- "Example the Lingo clustering algorithm.",
+ LocalProcessBase process = new LocalProcessBase(
+ "input-localnutch", // input
+ "output-clustersConsumer", // output
+ new String [] {"filter-lingo"}, // filters
+ "The Lingo clustering algorithm (www.carrot2.org).",
"");
try {
- controller.addProcess("lingo-nmf-km-3", lingoNMFKM3);
+ controller.addProcess(PROCESS_ID, process);
} catch (Exception e) {
throw new RuntimeException("Could not assemble clustering process.", e);
}
@@ -139,15 +181,17 @@
hitDetails);
requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
descriptions);
+
try {
- ProcessingResult result =
- controller.query("lingo-nmf-km-3", "pseudo-query", requestParams);
+ // The input component takes Nutch's results so we don't need the query argument.
+ final ProcessingResult result =
+ controller.query(PROCESS_ID, "no-query", requestParams);
- ClustersConsumerOutputComponent.Result output =
+ final ClustersConsumerOutputComponent.Result output =
(ClustersConsumerOutputComponent.Result) result.getQueryResult();
- List outputClusters = output.clusters;
- HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
+ final List outputClusters = output.clusters;
+ final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
int j = 0;
for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
@@ -162,5 +206,33 @@
} catch (Exception e) {
throw new RuntimeException("Unidentified problems with the clustering.", e);
}
+ }
+
+ /**
+ * Implementation of {@link Configurable}
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+
+ // Configure default language and other component settings.
+ if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) {
+ // Change the default language.
+ this.defaultLanguage = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
+ }
+ if (conf.getStrings(CONF_PROP_LANGUAGES) != null) {
+ this.languages = conf.getStrings(CONF_PROP_LANGUAGES);
+ }
+
+ logger.log(Level.INFO, "Default language: " + defaultLanguage);
+ logger.log(Level.INFO, "Enabled languages: " + Arrays.asList(languages));
+
+ initialize();
+ }
+
+ /**
+ * Implementation of {@link Configurable}
+ */
+ public Configuration getConf() {
+ return conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java Thu Apr 13 11:57:55 2006
@@ -33,7 +33,6 @@
* @version $Id: HitsClusterAdapter.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
*/
public class HitsClusterAdapter implements HitsCluster {
-
private RawCluster rawCluster;
private HitDetails [] hits;
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java Thu Apr 13 11:57:55 2006
@@ -34,6 +34,7 @@
import com.dawidweiss.carrot.core.local.ProcessingException;
import com.dawidweiss.carrot.core.local.RequestContext;
import com.dawidweiss.carrot.core.local.clustering.*;
+import com.dawidweiss.carrot.util.common.StringUtils;
/**
* A local input component that ignores the query passed from the
@@ -58,7 +59,19 @@
/** This component's capabilities */
private final static Set COMPONENT_CAPABILITIES
= new HashSet(Arrays.asList(new Object [] { RawDocumentsProducer.class }));
-
+
+ /**
+ * Default language code for hits that don't have their own.
+ */
+ private String defaultLanguage;
+
+ /**
+ * Creates an input component with the given default language code.
+ */
+ public LocalNutchInputComponent(String defaultLanguage) {
+ this.defaultLanguage = defaultLanguage;
+ }
+
/*
* @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
*/
@@ -66,15 +79,17 @@
// ignore the query; data will be provided from the request context.
}
- /** A callback hook that starts the processing. */
+ /**
+ * A callback hook that starts the processing.
+ */
public void startProcessing(RequestContext context) throws ProcessingException {
// let successor components know that the processing has started.
super.startProcessing(context);
// get the information about documents from the context.
- Map params = context.getRequestParameters();
- HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
- String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
+ final Map params = context.getRequestParameters();
+ final HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
+ final String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
if (details == null)
throw new ProcessingException("Details array must not be null.");
@@ -85,11 +100,10 @@
if (summaries.length != details.length)
throw new ProcessingException("Summaries and details must be of the same length.");
- RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
-
// produce 'documents' for successor components.
+ final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
for (int i=0;i<summaries.length;i++) {
- consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i])));
+ consumer.addDocument(new NutchDocument(i, details[i], htmlToText(summaries[i]), defaultLanguage));
}
}
@@ -107,52 +121,14 @@
return SUCCESSOR_CAPABILITIES;
}
- // --- The methods below, plus dependency on the Nekohtml parser
- // are only required because Nutch's summaries are in HTML by default.
- // I guess it would be possible to get rid of the code below by
- // adding patches/ methods to Nutch that return plain text summaries.
- //
- // The temporary quick-and-dirty solution below has been provided by Doug, thanks.
-
- /**
- * The text buffer for plain text.
- */
- private StringBuffer textBuffer = new StringBuffer();
-
- /**
- * A parser that will convert html to plain text.
- */
- private AbstractSAXParser parser;
-
- /*
- * Anonymous initialization of the parser. Since we declared
- * the current solution to be quick and dirty, it doesn't have
- * to be in the constructor :)
- */
- {
- try {
- parser = new AbstractSAXParser(new HTMLConfiguration()){};
- parser.setContentHandler(new DefaultHandler() {
- public void characters(char[] chars, int start, int length)
- throws SAXException {
- textBuffer.append(chars, start, length);
- }
- });
- } catch (Exception e) {
- throw new RuntimeException(e.toString(), e);
- }
- }
-
/**
* Converts a html chunk to plain text.
+ *
+ * This method is only required because Nutch's summaries are in HTML.
+ * I guess it would be possible to get rid of the code below by
+ * adding patches/ methods to Nutch that return plain text summaries.
*/
private final String htmlToText(String html) {
- textBuffer.setLength(0);
- try {
- parser.parse(new InputSource(new StringReader(html)));
- } catch (Exception e) { // shouldn't happen
- throw new RuntimeException(e.toString(), e);
- }
- return textBuffer.toString();
+ return StringUtils.removeMarkup(html);
}
}
Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java?rev=393889&r1=393888&r2=393889&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java Thu Apr 13 11:57:55 2006
@@ -22,32 +22,47 @@
import com.dawidweiss.carrot.core.local.clustering.RawDocumentBase;
/**
- * An adapter class that implements {@link RawDocument} for
- * Carrot2.
+ * An adapter class that implements {@link RawDocument} required for Carrot2.
*
* @author Dawid Weiss
* @version $Id: NutchDocument.java,v 1.2 2004/08/10 00:18:43 johnnx Exp $
*/
public class NutchDocument extends RawDocumentBase {
-
+ /**
+ * Integer identifier of this document. We need a subclass of
+ * {@link java.lang.Object}, so this should do.
+ */
private final Integer id;
/**
* Creates a new document with the given id, <code>summary</code> and wrapping
* a <code>details</code> hit details.
*/
- public NutchDocument(int id, HitDetails details, String summary) {
+ public NutchDocument(int id, HitDetails details, String summary, String defaultLanguage) {
super.setProperty(RawDocument.PROPERTY_URL, details.getValue("url"));
super.setProperty(RawDocument.PROPERTY_SNIPPET, summary);
-
- String title = details.getValue("title");
+
+ final String title = details.getValue("title");
if (title != null && !"".equals(title)) {
super.setProperty(RawDocument.PROPERTY_TITLE, title);
}
+ String lang = details.getValue("lang");
+ if (lang == null) {
+ // No default language. Take the default from the configuration file.
+ lang = defaultLanguage;
+ }
+ // Use this language for the snippet. Truncate longer ISO codes
+ // to only include two-letter language code.
+ if (lang.length() > 2) {
+ lang = lang.substring(0, 2);
+ }
+ lang = lang.toLowerCase();
+ super.setProperty(RawDocument.PROPERTY_LANGUAGE, lang);
+
this.id = new Integer(id);
}
-
+
/*
* @see com.dawidweiss.carrot.core.local.clustering.RawDocument#getId()
*/