You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/08/28 08:26:53 UTC

svn commit: r570327 - in /lucene/nutch/trunk: ./ src/plugin/clustering-carrot2/ src/plugin/clustering-carrot2/lib/ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/ src/plugin/clustering-carrot2/src/test/org/apache/nutch/clust...

Author: dogacan
Date: Mon Aug 27 23:26:51 2007
New Revision: 570327

URL: http://svn.apache.org/viewvc?rev=570327&view=rev
Log:
NUTCH-544 - Upgrade Carrot2 clustering plugin to the newest stable release (2.1). Contributed by Dawid Weiss.

Added:
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/Jama-1.0.2.jar   (with props)
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-collections-3.2.jar   (with props)
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool-1.3.jar   (with props)
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml
Removed:
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/Jama-1.0.1-patched.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-collections-3.1-patched.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-collections.LICENSE
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool-1.1.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/test-input.xml
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/ClustererTest.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
    lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool.LICENSE
    lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
    lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
    lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Aug 27 23:26:51 2007
@@ -120,6 +120,9 @@
 40. NUTCH-439 - Top Level Domains Indexing / Scoring. Also adds 
     domain-related utilities. (Enis Soztutar via dogacan)
 
+41. NUTCH-544 - Upgrade Carrot2 clustering plugin to the newest stable 
+    release (2.1). (Dawid Weiss via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/build.xml Mon Aug 27 23:26:51 2007
@@ -16,7 +16,6 @@
  limitations under the License.
 -->
 <project name="clustering-carrot2" default="jar-core">
-
   <import file="../build-plugin.xml"/>
 
   <!-- Build compilation dependencies -->
@@ -29,12 +28,19 @@
     <fileset dir="${nutch.root}/build">
       <include name="**/lib-nekohtml/*.jar" />
     </fileset>
+    <fileset dir="${nutch.root}/lib">
+      <include name="commons-lang-*.jar" />
+    </fileset>
   </path>
 
   <!-- Deploy Unit test dependencies -->
   <target name="deps-test">
     <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+
+	<copy toDir="${build.test}">
+		<fileset dir="${src.test}" excludes="**/*.java" />
+	</copy>
   </target>
 
 </project>

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/Jama-1.0.2.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/Jama-1.0.2.jar?rev=570327&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/Jama-1.0.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-local-core.jar?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-common.jar?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
Binary files - no diff available.

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-collections-3.2.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-collections-3.2.jar?rev=570327&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-collections-3.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool-1.3.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool-1.3.jar?rev=570327&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool-1.3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool.LICENSE
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool.LICENSE?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool.LICENSE (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/lib/commons-pool.LICENSE Mon Aug 27 23:26:51 2007
@@ -1,60 +1,60 @@
-/*
- * $Revision: 1.2 $
- * $Date: 2004/06/19 16:26:16 $
- *
- * ====================================================================
- *
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. The end-user documentation included with the redistribution, if
- *    any, must include the following acknowledgement:
- *       "This product includes software developed by the
- *        Apache Software Foundation - http://www.apache.org/"
- *    Alternately, this acknowledgement may appear in the software itself,
- *    if and wherever such third-party acknowledgements normally appear.
- *
- * 4. The names "The Jakarta Project", "Commons", and "Apache Software
- *    Foundation" must not be used to endorse or promote products derived
- *    from this software without prior written permission. For written
- *    permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache"
- *    nor may "Apache" appear in their names without prior written
- *    permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation.  For more
- * information on the Apache Software Foundation, please see
- * http://www.apache.org/
- *
- */
+/*
+ * $Revision: 206 $
+ * $Date: 2004-06-19 18:26:22 +0200 (Sat, 19 Jun 2004) $
+ *
+ * ====================================================================
+ *
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution, if
+ *    any, must include the following acknowledgement:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation - http://www.apache.org/"
+ *    Alternately, this acknowledgement may appear in the software itself,
+ *    if and wherever such third-party acknowledgements normally appear.
+ *
+ * 4. The names "The Jakarta Project", "Commons", and "Apache Software
+ *    Foundation" must not be used to endorse or promote products derived
+ *    from this software without prior written permission. For written
+ *    permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache"
+ *    nor may "Apache" appear in their names without prior written
+ *    permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * http://www.apache.org/
+ *
+ */
  

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/plugin.xml Mon Aug 27 23:26:51 2007
@@ -17,24 +17,29 @@
 -->
 <plugin
    id="clustering-carrot2"
-   name="Online Search Results Clustering using Carrot2's Lingo component"
-   version="1.0.2"
-   provider-name="carrot2.sourceforge.net">
+   name="Online Search Results Clustering using Carrot2's components"
+   version="1.0.3"
+   provider-name="www.carrot2.org">
 
    <runtime>
       <library name="clustering-carrot2.jar">
          <export name="*"/>
       </library>
 
+      <!--
+	   The defaults for Lingo. If you plan to use another clustering
+	   algorithm from the Carrot2 project, you'll need all the JARs
+	   required for that algorithm.
+	-->
       <library name="carrot2-filter-lingo.jar"/>
       <library name="carrot2-local-core.jar"/>
       <library name="carrot2-snowball-stemmers.jar"/>
       <library name="carrot2-util-common.jar"/>
       <library name="carrot2-util-tokenizer.jar"/>
 
-      <library name="commons-collections-3.1-patched.jar"/>
-      <library name="commons-pool-1.1.jar"/>
-      <library name="Jama-1.0.1-patched.jar"/>
+      <library name="commons-collections-3.2.jar"/>
+      <library name="commons-pool-1.3.jar"/>
+      <library name="Jama-1.0.2.jar"/>
       <library name="violinstrings-1.0.2.jar"/>
    </runtime>
 
@@ -45,8 +50,7 @@
    <extension id="org.apache.nutch.clustering.carrot2"
               name="Carrot2 Clusterer"
               point="org.apache.nutch.clustering.OnlineClusterer">
-
-      <implementation id="Carrot2-Lingo"
+      <implementation id="Carrot2"
                       class="org.apache.nutch.clustering.carrot2.Clusterer"/>
    </extension>
 </plugin>

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/readme.txt Mon Aug 27 23:26:51 2007
@@ -1,44 +1,7 @@
 This plugin extension adds search results clustering capability to Nutch search 
 frontend.
 
-The user interface in Nutch is very limited and you'll most likely need something 
-more application-specific. Look at http://www.carrot2.org or 
-http://carrot.cs.put.poznan.pl for inspiration.
+Carrot2 JARs come from codebase in version: 2.1
 
-Libraries in this release are precompiled with stemming and stop words for various
-languages present in Carrot2 codebase (imported from the Snowball project). You 
-must define the default language and supported languages in Nutch configuration
-file (nutch-site.xml). If nothing is given in Nutch configuration, English is 
-taken by default.
-
-<!-- Carrot2 Clustering plugin configuration -->
-
-<property>
-  <name>extension.clustering.carrot2.defaultLanguage</name>
-  <value>en</value>
-  <description>Two-letter ISO code of the language. 
-  http://www.ics.uci.edu/pub/ietf/http/related/iso639.txt</description>
-</property>
-
-<property>
-  <name>extension.clustering.carrot2.languages</name>
-  <value>en,nl,da,fi,fr,de,it,no,pl,pt,ru,es,sv</value>
-  <description>All languages to be used by the clustering plugin. 
-  This list includes all currently supported languages (although not all of them
-  will successfully instantiate -- support for Polish requires additional
-  libraries for instance). Adjust to your needs, fewer languages take less
-  memory.
-  
-  If you use the language recognizer plugin, then each hit will come with its
-  own ISO language code. All hits with no explicit language take the default
-  language specified in "extension.clustering.carrot2.defaultLanguage" property.
-  </description>
-</property>
-
-
-If you need a different language/ clustering algorithm, you'll need to modify 
-Nutch plugin code a bit (we don't want the plugin to outgrow Nutch, so we 
-include just the essentials here). Ask on carrot@ developers mailing list for 
-help if you need it.
-
-Carrot2 JARs come from codebase in version: 1.0.2
+See the WIKI for more information about configuration and installation
+of this plugin.

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java Mon Aug 27 23:26:51 2007
@@ -16,43 +16,59 @@
  */
 package org.apache.nutch.clustering.carrot2;
 
-import java.util.*;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
 import org.apache.nutch.clustering.HitsCluster;
 import org.apache.nutch.clustering.OnlineClusterer;
 import org.apache.nutch.searcher.HitDetails;
+import org.carrot2.core.DuplicatedKeyException;
+import org.carrot2.core.InitializationException;
+import org.carrot2.core.LocalComponent;
+import org.carrot2.core.LocalComponentFactory;
+import org.carrot2.core.LocalControllerBase;
+import org.carrot2.core.LocalProcess;
+import org.carrot2.core.LocalProcessBase;
+import org.carrot2.core.MissingComponentException;
+import org.carrot2.core.MissingProcessException;
+import org.carrot2.core.ProcessingResult;
+import org.carrot2.core.clustering.RawCluster;
+import org.carrot2.core.controller.ControllerHelper;
+import org.carrot2.core.controller.LoaderExtensionUnknownException;
+import org.carrot2.core.impl.ArrayOutputComponent;
+import org.carrot2.core.linguistic.Language;
+import org.carrot2.filter.lingo.local.LingoLocalFilterComponent;
+import org.carrot2.util.tokenizer.languages.AllKnownLanguages;
 
-import com.dawidweiss.carrot.core.local.*;
-import com.dawidweiss.carrot.core.local.clustering.RawCluster;
-import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
-import com.dawidweiss.carrot.core.local.linguistic.Language;
-import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages;
-import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
 
 
 /**
- * An plugin providing an implementation of {@link OnlineClusterer} 
+ * This plugin provides an implementation of {@link OnlineClusterer} 
  * extension using clustering components of the Carrot2 project
- * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
+ * (<a href="http://www.carrot2.org">http://www.carrot2.org</a>).
  * 
- * We hardcode the following Carrot2 process:
+ * <p>This class hardcodes an equivalent of the following Carrot2 process:
  * <pre><![CDATA[
  * <local-process id="yahoo-lingo">
  *   <name>Yahoo Search API -- Lingo Classic Clusterer</name>
  * 
- *   <input  component-key="input-localnutch" />
+ *   <input  component-key="input-nutch" />
  *   <filter component-key="filter-lingo" />
  *   <output component-key="output-clustersConsumer" />
  * </local-process>
  * ]]></pre>
- *
- * @author Dawid Weiss
- * @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
  */
 public class Clusterer implements OnlineClusterer, Configurable {
   /** Default language property name. */
@@ -69,7 +85,7 @@
   public static final Log logger = LogFactory.getLog(Clusterer.class);  
 
   /** The LocalController instance used for clustering */
-  private LocalController controller;
+  private LocalControllerBase controller;
 
   /** Nutch configuration. */
   private Configuration conf;
@@ -91,100 +107,22 @@
    * of the clusterer.
    */
   public Clusterer() {
-    initialize();
+    // Don't forget to call {@link #setConf(Configuration)}.
   }
 
-  private synchronized void initialize() {
-    controller = new LocalControllerBase();
-    addComponentFactories();
-    addProcesses();
-  }
-
-  /** Adds the required component factories to a local Carrot2 controller. */
-  private void addComponentFactories() {
-    //  *   <input  component-key="input-localnutch" />
-    LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        return new LocalNutchInputComponent(defaultLanguage);
-      }
-    };
-    controller.addLocalComponentFactory("input-localnutch", nutchInputFactory);
-
-    // *   <filter component-key="filter-lingo" />
-    LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        HashMap defaults = new HashMap();
-
-        // These are adjustments settings for the clustering algorithm.
-        // If you try the live WebStart demo of Carrot2 you can see how they affect
-        // the final clustering: http://www.carrot2.org/webstart 
-        defaults.put("lsi.threshold.clusterAssignment", "0.150");
-        defaults.put("lsi.threshold.candidateCluster",  "0.775");
-
-        // Initialize a new Lingo clustering component.
-        ArrayList languageList = new ArrayList(languages.length);
-        for (int i = 0; i < languages.length; i++) {
-          final String lcode = languages[i];
-          try {
-            Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
-            if (lang == null) {
-              if (logger.isWarnEnabled()) {
-                logger.warn("Language not supported in Carrot2: " + lcode);
-              }
-            } else {
-              languageList.add(lang);
-              if (logger.isDebugEnabled()) {
-                logger.debug("Language loaded: " + lcode);
-              }
-            }
-          } catch (Throwable t) {
-            if (logger.isWarnEnabled()) {
-              logger.warn("Language could not be loaded: " + lcode, t);
-            }
-          }
-        }
-        return new LingoLocalFilterComponent(
-          (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
-      }
-    };
-    controller.addLocalComponentFactory("filter-lingo", lingoFactory);
-
-    // *   <output component-key="output-clustersConsumer" />
-    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        return new ClustersConsumerOutputComponent();
-      }
-    };
-    controller.addLocalComponentFactory("output-clustersConsumer", 
-      clusterConsumerOutputFactory);
-  }
-
-  /** 
-   * Adds a hardcoded clustering process to the local controller.
-   */  
-  private void addProcesses() {
-    LocalProcessBase process = new LocalProcessBase(
-        "input-localnutch",                                   // input
-        "output-clustersConsumer",                            // output
-        new String [] {"filter-lingo"},                       // filters
-        "The Lingo clustering algorithm (www.carrot2.org).",
-        "");
-
-    try {
-      controller.addProcess(PROCESS_ID, process);
-    } catch (Exception e) {
-      throw new RuntimeException("Could not assemble clustering process.", e);
-    }
-  }
-  
   /**
    * See {@link OnlineClusterer} for documentation.
    */
   public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
-    Map requestParams = new HashMap();
-    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
+    if (this.controller == null) {
+      logger.error("initialize() not called.");
+      return new HitsCluster[0];
+    }
+
+    final Map requestParams = new HashMap();
+    requestParams.put(NutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
       hitDetails);
-    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
+    requestParams.put(NutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
       descriptions);
 
     try {
@@ -192,8 +130,8 @@
       final ProcessingResult result = 
         controller.query(PROCESS_ID, "no-query", requestParams);
 
-      final ClustersConsumerOutputComponent.Result output =
-        (ClustersConsumerOutputComponent.Result) result.getQueryResult();
+      final ArrayOutputComponent.Result output =
+        (ArrayOutputComponent.Result) result.getQueryResult();
 
       final List outputClusters = output.clusters;
       final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
@@ -218,7 +156,7 @@
    */
   public void setConf(Configuration conf) {
     this.conf = conf;
-    
+
     // Configure default language and other component settings.
     if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) {
       // Change the default language.
@@ -242,4 +180,151 @@
   public Configuration getConf() {
     return conf;
   }
+  
+  /**
+   * Initialize clustering processes and Carrot2 components.
+   */
+  private synchronized void initialize() {
+    // Initialize language list, temporarily switching off logging
+    // of warnings. This is a bit of a hack, but we don't want to
+    // redistribute the entire Carrot2 distro and this prevents
+    // nasty ClassNotFound warnings.
+    final Logger c2Logger = Logger.getLogger("org.carrot2");
+    final Level original = c2Logger.getLevel();
+    c2Logger.setLevel(Level.ERROR);
+    AllKnownLanguages.getLanguageCodes();
+    c2Logger.setLevel(original);
+
+    // Initialize the controller.    
+    controller = new LocalControllerBase();
+
+    final Configuration nutchConf = getConf();
+    final String processResource = nutchConf.get(
+        "extension.clustering.carrot2.process-resource");
+
+    if (processResource == null) {
+      logger.info("Using default clustering algorithm (Lingo).");
+      addDefaultProcess();
+    } else {
+      logger.info("Using custom clustering process: " + processResource);
+      controller.setComponentAutoload(true);
+      
+      final ControllerHelper helper = new ControllerHelper();
+      final InputStream is = Thread.currentThread()
+        .getContextClassLoader().getResourceAsStream(processResource);
+      if (is != null) {
+        try {
+          final LocalComponentFactory nutchInputFactory = new LocalComponentFactory() {
+            public LocalComponent getInstance() {
+              return new NutchInputComponent(defaultLanguage);
+            }
+          };
+          controller.addLocalComponentFactory("input-nutch", nutchInputFactory);
+          
+          final LocalProcess process = helper.loadProcess(
+              helper.getExtension(processResource), is).getProcess();
+          controller.addProcess(PROCESS_ID, process);
+          is.close();
+        } catch (IOException e) {
+          logger.error("Could not load process resource: " + processResource, e);
+        } catch (LoaderExtensionUnknownException e) {
+          logger.error("Unrecognized extension of process resource: " + processResource);
+        } catch (InstantiationException e) {
+          logger.error("Could not instantiate process: " + processResource, e);
+        } catch (InitializationException e) {
+          logger.error("Could not initialize process: " + processResource, e);
+        } catch (DuplicatedKeyException e) {
+          logger.error("Duplicated key (unreachable?): " + processResource, e);
+        } catch (MissingComponentException e) {
+          logger.error("Some components are missing, could not initialize process: " 
+              + processResource, e);
+        }
+      } else {
+        logger.error("Could not find process resource: " + processResource);
+      }
+    }
+  }
+
+  /**
+   * Adds a default clustering process using Lingo algorithm.
+   */
+  private void addDefaultProcess() {
+    try {
+      addComponentFactories();
+      addProcesses();
+    } catch (DuplicatedKeyException e) {
+      logger.fatal("Duplicated component or process identifier.", e);
+    }
+  }
+
+  /** Adds the required component factories to a local Carrot2 controller. */
+  private void addComponentFactories() throws DuplicatedKeyException {
+    //  *   <input  component-key="input-nutch" />
+    LocalComponentFactory nutchInputFactory = new LocalComponentFactory() {
+      public LocalComponent getInstance() {
+        return new NutchInputComponent(defaultLanguage);
+      }
+    };
+    controller.addLocalComponentFactory("input-nutch", nutchInputFactory);
+
+    // *   <filter component-key="filter-lingo" />
+    LocalComponentFactory lingoFactory = new LocalComponentFactory() {
+      public LocalComponent getInstance() {
+        final HashMap defaults = new HashMap();
+
+        // These are adjustments settings for the clustering algorithm.
+        // If you try the live WebStart demo of Carrot2 you can see how they affect
+        // the final clustering: http://www.carrot2.org 
+        defaults.put("lsi.threshold.clusterAssignment", "0.150");
+        defaults.put("lsi.threshold.candidateCluster",  "0.775");
+
+        // Initialize a new Lingo clustering component.
+        ArrayList languageList = new ArrayList(languages.length);
+        for (int i = 0; i < languages.length; i++) {
+          final String lcode = languages[i];
+          try {
+            final Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
+            if (lang == null) {
+              logger.warn("Language not supported in Carrot2: " + lcode);
+            } else {
+              languageList.add(lang);
+              logger.debug("Language loaded: " + lcode);
+            }
+          } catch (Throwable t) {
+              logger.warn("Language could not be loaded: " + lcode, t);
+          }
+        }
+        return new LingoLocalFilterComponent(
+          (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
+      }
+    };
+    controller.addLocalComponentFactory("filter-lingo", lingoFactory);
+
+    // *   <output component-key="output-clustersConsumer" />
+    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactory() {
+      public LocalComponent getInstance() {
+        return new ArrayOutputComponent();
+      }
+    };
+    controller.addLocalComponentFactory("output-array", 
+      clusterConsumerOutputFactory);
+  }
+
+  /** 
+   * Adds a hardcoded clustering process to the local controller.
+   */  
+  private void addProcesses() {
+    final LocalProcessBase process = new LocalProcessBase(
+        "input-nutch",
+        "output-array",
+        new String [] {"filter-lingo"},
+        "The Lingo clustering algorithm (www.carrot2.org).",
+        "");
+
+    try {
+      controller.addProcess(PROCESS_ID, process);
+    } catch (Exception e) {
+      throw new RuntimeException("Could not assemble clustering process.", e);
+    }
+  }  
 }

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java Mon Aug 27 23:26:51 2007
@@ -19,18 +19,14 @@
 import java.util.Iterator;
 import java.util.List;
 
-import com.dawidweiss.carrot.core.local.clustering.RawCluster;
-import com.dawidweiss.carrot.core.local.clustering.RawDocument;
-
 import org.apache.nutch.clustering.HitsCluster;
 import org.apache.nutch.searcher.HitDetails;
+import org.carrot2.core.clustering.RawCluster;
+import org.carrot2.core.clustering.RawDocument;
 
 /**
  * An adapter of Carrot2's {@link RawCluster} interface to
  * {@link HitsCluster} interface. 
- *
- * @author Dawid Weiss
- * @version $Id: HitsClusterAdapter.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
  */
 public class HitsClusterAdapter implements HitsCluster {
   private RawCluster rawCluster;
@@ -59,7 +55,7 @@
    */
   public HitsCluster[] getSubclusters() {
     if (this.subclusters == null) {
-      List rawSubclusters = rawCluster.getSubclusters();
+      final List rawSubclusters = rawCluster.getSubclusters();
       if (rawSubclusters == null || rawSubclusters.size() == 0) {
         subclusters = null;
       } else {

Modified: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java?rev=570327&r1=570326&r2=570327&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java (original)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java Mon Aug 27 23:26:51 2007
@@ -17,15 +17,11 @@
 package org.apache.nutch.clustering.carrot2;
 
 import org.apache.nutch.searcher.HitDetails;
-
-import com.dawidweiss.carrot.core.local.clustering.RawDocument;
-import com.dawidweiss.carrot.core.local.clustering.RawDocumentBase;
+import org.carrot2.core.clustering.RawDocument;
+import org.carrot2.core.clustering.RawDocumentBase;
 
 /**
  * An adapter class that implements {@link RawDocument} required for Carrot2.  
- *
- * @author Dawid Weiss
- * @version $Id: NutchDocument.java,v 1.2 2004/08/10 00:18:43 johnnx Exp $
  */
 public class NutchDocument extends RawDocumentBase {
   /**
@@ -39,28 +35,25 @@
    * a <code>details</code> hit details.
    */
   public NutchDocument(int id, HitDetails details, String summary, String defaultLanguage) {
-    super.setProperty(RawDocument.PROPERTY_URL, details.getValue("url"));
-    super.setProperty(RawDocument.PROPERTY_SNIPPET, summary);
+    super(details.getValue("url"), details.getValue("title"), summary);
 
-    final String title = details.getValue("title");
-    if (title != null && !"".equals(title)) {
-      super.setProperty(RawDocument.PROPERTY_TITLE, title);
-    }
-    
+    // Handle document language -- attempt to extract it from the details,
+    // otherwise set to the default.
     String lang = details.getValue("lang");
     if (lang == null) {
       // No default language. Take the default from the configuration file.
       lang = defaultLanguage;
     }
+
     // Use this language for the snippet. Truncate longer ISO codes
     // to only include two-letter language code.
     if (lang.length() > 2) {
       lang = lang.substring(0, 2);
     }
-    lang = lang.toLowerCase();
+    lang = lang.toLowerCase();    
     super.setProperty(RawDocument.PROPERTY_LANGUAGE, lang);
 
-    this.id = new Integer(id);
+    this.id = Integer.valueOf(id);
   }
 
   /*
@@ -69,4 +62,4 @@
   public Object getId() {
     return id;
   }
-}
+}
\ No newline at end of file

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java?rev=570327&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java (added)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java Mon Aug 27 23:26:51 2007
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.clustering.carrot2;
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.nutch.searcher.HitDetails;
+import org.carrot2.core.LocalInputComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.RequestContext;
+import org.carrot2.core.clustering.RawDocumentsConsumer;
+import org.carrot2.core.clustering.RawDocumentsProducer;
+
+/**
+ * An input component that ignores the query passed from the
+ * controller and instead looks for data stored in the request context.
+ * This enables us to reuse the same physical component implementation
+ * for data that has already been acquired from Nutch.
+ */
+public class NutchInputComponent extends LocalInputComponentBase {
+  public final static String NUTCH_INPUT_HIT_DETAILS_ARRAY
+    = "NUTCH_INPUT_HIT_DETAILS_ARRAY";
+
+  public final static String NUTCH_INPUT_SUMMARIES_ARRAY 
+    = "NUTCH_INPUT_SUMMARIES_ARRAY";
+
+  /** Capabilities required from the next component in the chain */
+  private final static Set SUCCESSOR_CAPABILITIES = toSet(RawDocumentsConsumer.class);
+
+  /** This component's capabilities */
+  private final static Set COMPONENT_CAPABILITIES = toSet(RawDocumentsProducer.class);
+
+  /**
+   * Default language code for hits that don't have their own.
+   */
+  private String defaultLanguage;
+
+  /**
+   * Creates an input component with the given default language code.
+   */
+  public NutchInputComponent(String defaultLanguage) {
+    this.defaultLanguage = defaultLanguage;
+  }
+
+  /*
+   * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
+   */
+  public void setQuery(String query) {
+      // ignore the query; data will be provided from the request context.
+  }
+
+  /**
+   * A callback hook that starts the processing.
+   */
+  public void startProcessing(RequestContext context) throws ProcessingException {
+    // let successor components know that the processing has started.
+    super.startProcessing(context);
+    
+    // get the information about documents from the context.
+    final Map params = context.getRequestParameters();
+    final HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
+    final String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
+
+    if (details == null)
+      throw new ProcessingException("Details array must not be null.");
+
+    if (summaries == null)
+      throw new ProcessingException("Summaries array must not be null.");
+
+    if (summaries.length != details.length)
+      throw new ProcessingException("Summaries and details must be of the same length.");
+    
+    // produce 'documents' for successor components.
+    final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
+    for (int i = 0; i < summaries.length; i++) {
+      consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage));
+    }
+  }
+
+  /**
+   * Returns the capabilities provided by this component.
+   */
+  public Set getComponentCapabilities() {
+    return COMPONENT_CAPABILITIES;
+  }
+    
+  /**
+   * Returns the capabilities required from the successor component.
+   */
+  public Set getRequiredSuccessorCapabilities() {
+    return SUCCESSOR_CAPABILITIES;
+  }
+}

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java?rev=570327&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java (added)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java Mon Aug 27 23:26:51 2007
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.clustering.carrot2;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.clustering.HitsCluster;
+import org.apache.nutch.searcher.HitDetails;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * A test case for the Carrot2-based clusterer plugin to Nutch.
+ */
+public class TestClusterer extends TestCase {
+  private Clusterer c;
+  
+  public TestClusterer(String testName) {
+    super(testName);
+  }
+  
+  protected void setUp() throws Exception {
+    c = new Clusterer();
+    c.setConf(new Configuration());
+  }
+  
+  /**
+   * The clusterer should not fail on empty input, returning
+   * an empty array of {@link HitsCluster}.
+   */
+  public void testEmptyInput() {
+    final HitDetails [] hitDetails = new HitDetails[0];
+    final String [] descriptions = new String [0];
+    final HitsCluster [] clusters = c.clusterHits(hitDetails, descriptions);
+    assertTrue(clusters != null && clusters.length == 0);
+  }
+
+  /**
+   * Tests the clusterer on some cached data.
+   */
+  public void testOnCachedData() throws Exception {
+    final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+    final DocumentBuilder parser = factory.newDocumentBuilder();
+    final InputStream is = getClass().getResourceAsStream("test-input.xml");
+    assertNotNull("test-input.xml not found", is);
+    final Document document = parser.parse(is);
+    is.close();
+
+    final Element data = document.getDocumentElement();
+    final NodeList docs = data.getElementsByTagName("document");
+    
+    final ArrayList summaries = new ArrayList();
+    final ArrayList hitDetails = new ArrayList();
+
+    assertTrue(docs.getLength() > 0);
+    for (int i = 0; i < docs.getLength(); i++) {
+      final Element doc = (Element) docs.item(i);
+      assertTrue(doc.getNodeType() == Node.ELEMENT_NODE);
+      final Element urlElement = (Element) doc.getElementsByTagName("url").item(0);
+      final Element snippetElement = (Element) doc.getElementsByTagName("snippet").item(0);
+      final Element titleElement = (Element) doc.getElementsByTagName("title").item(0);
+
+      summaries.add(toText(titleElement) + " " + toText(snippetElement));
+      hitDetails.add(new HitDetails(
+          new String [] {"url"}, 
+          new String [] {toText(urlElement)}));
+    }
+
+    HitsCluster [] clusters = c.clusterHits(
+        (HitDetails[]) hitDetails.toArray(new HitDetails[hitDetails.size()]),
+        (String[]) summaries.toArray(new String[summaries.size()]));
+    
+    // There should be SOME clusters in the input... words distribution
+    // should not be random because some words have higher probability.
+    assertTrue(clusters != null);
+    assertTrue("Clusters expected, but not found.", clusters.length > 0);
+
+    // Check hit references inside clusters.
+    for (int i = 0; i < clusters.length; i++) {
+      assertTrue(clusters[i].getHits().length > 0);
+    }
+
+    /*
+    // Dump cluster content if you need to.
+    System.out.println("Clusters: " + clusters.length);
+    for (int i = 0; i < clusters.length; i++) {
+      dump(0, clusters[i]);
+    }
+    */
+  }
+  
+  /**
+   * Converts a {@link Element} to plain text.
+   */
+  private String toText(Element snippetElement) {
+    final StringBuffer buffer = new StringBuffer();
+    final NodeList list = snippetElement.getChildNodes();
+    for (int i = 0; i < list.getLength(); i++) {
+      Node n = list.item(i);
+      if (n.getNodeType() == Node.TEXT_NODE) {
+        buffer.append(n.getNodeValue());
+      } else if (n.getNodeType() == Node.CDATA_SECTION_NODE) {
+        n.getNodeValue();
+      } else throw new RuntimeException("Unexpected nested element when converting to text.");
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Dumps the content of {@link HitsCluster} to system output stream. 
+   */
+  private void dump(int level, HitsCluster cluster) {
+    String [] labels = cluster.getDescriptionLabels();
+    for (int indent = 0; indent<level; indent++) {
+      System.out.print( "   " );
+    }
+    System.out.print(">> ");
+    if (cluster.isJunkCluster()) System.out.print("(Junk) ");
+    System.out.print("CLUSTER: ");
+    for (int i=0;i<labels.length;i++) {
+      System.out.print( labels[i] + "; " );
+    }
+    System.out.println();
+    
+    HitsCluster [] subclusters = cluster.getSubclusters();
+    if (subclusters != null) {
+      for (int i=0;i<subclusters.length;i++) {
+        dump(level + 1, subclusters[i]);
+      }
+    }
+    
+    // dump documents.
+    HitDetails [] hits = cluster.getHits();
+    if (hits != null) {
+      for (int i=0;i<hits.length;i++ ) {
+        for (int indent = 0; indent<level; indent++) {
+          System.out.print( "   " );
+        }
+        System.out.print( hits[i].getValue("url") );
+        System.out.print( "; " );
+        System.out.println( hits[i].getValue("title") );
+      }
+    }
+  }
+}

Added: lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml?rev=570327&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml (added)
+++ lucene/nutch/trunk/src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml Mon Aug 27 23:26:51 2007
@@ -0,0 +1,303 @@
+<searchresult>
+<query requested-results="100">data mining</query>
+<document id="0">	<url>http://www.kdnuggets.com/</url>
+	<title>KD Nuggets</title>
+	<snippet>Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings.</snippet>
+</document><document id="1">	<url>http://en.wikipedia.org/wiki/Data_mining</url>
+	<title>Data Mining - Wikipedia</title>
+	<snippet>Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns.</snippet>
+</document><document id="2">	<url>http://www.thearling.com/</url>
+	<title>Thearling.com</title>
+	<snippet>Kurt Thearling&apos;s site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies.</snippet>
+</document><document id="3">	<url>http://www.the-data-mine.com/</url>
+	<title>The Data Mine</title>
+	<snippet>Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining.</snippet>
+</document><document id="4">	<url>http://www.data-miners.com/</url>
+	<title>Data Miners</title>
+	<snippet>Data mining consultancy; services include predictive modeling, consulting, and seminars.</snippet>
+</document><document id="5">	<url>http://www.dmg.org/</url>
+	<title>DMG</title>
+	<snippet>The Laboratory for Advanced Computing develops technologies for high performance computing, high performance networking, internet computing, data mining and related areas. ... Data Mining Group. DMG. DMG Menu ... The Data Mining Group (DMG) is an independent, vendor led group which develops data mining standards, such as the ...</snippet>
+</document><document id="6">	<url>http://www.twocrows.com/glossary.htm</url>
+	<title>Two Crows: Data mining glossary</title>
+	<snippet>Data mining terms concisely defined. ... factor in assessing the success of data mining. When applied to data, accuracy refers to the rate of ... For example, a data mining software system may have an API which ...</snippet>
+</document><document id="7">	<url>http://www.monografias.com/trabajos/datamining/datamining.shtml</url>
+	<title>Data Mining - Monografias.com</title>
+	<snippet>... Data Mining, la extracción de información oculta y predecible de grandes bases de ... de Información (Data Warehouse). Las herramientas de Data Mining predicen futuras tendencias y ...</snippet>
+</document><document id="8">	<url>http://www.ccsu.edu/datamining/resources.html</url>
+	<title>CCSU - Data Mining</title>
+	<snippet>Data Mining Resources. Resources. Groups. Data Sets. Papers on Data Mining. Commercial. Register at</snippet>
+</document><document id="9">	<url>http://www-db.stanford.edu/~ullman/mining/mining.html</url>
+	<title>Jeff Ullman&apos;s Data Mining Lecture Notes</title>
+	<snippet>Offers an introduction to various data mining applications and techniques: association-rule mining, low-support/high correlation, query flocks, searching the Web, web mining, and clustering.</snippet>
+</document><document id="10">	<url>http://www.statsoft.com/textbook/stdatmin.html</url>
+	<title>Electronic Statistics Textbook: Data Mining Techniques</title>
+	<snippet>Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques.</snippet>
+</document><document id="11">	<url>http://www.autonlab.org/tutorials</url>
+	<title>Statistical Data Mining Tutorials</title>
+	<snippet>Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms.</snippet>
+</document><document id="12">	<url>http://www.sas.com/technologies/data_mining</url>
+	<title>SAS | Data and Text Mining</title>
+	<snippet>... of information, the potential would be enormous. With data mining, the possibilities are endless ... almost upon its introduction, our data mining technology continues to receive rave ...</snippet>
+</document><document id="13">	<url>http://www.almaden.ibm.com/cs/quest</url>
+	<title>IBM Research | Almaden Research Center | test</title>
+	<snippet>... Privacy-preserving data mining - preserves privacy at the individual level, while still allowing accurate data mining models at the aggregate level ...</snippet>
+</document><document id="14">	<url>http://www.oracle.com/technology/products/bi/odm/</url>
+	<title>Oracle Data Mining</title>
+	<snippet>... user interface for Oracle Data Mining that helps data analysts mine their Oracle data to find valuable ... With Oracle Data Miner and Oracle Data Mining, the data never leaves the ...</snippet>
+</document><document id="15">	<url>http://www.cs.waikato.ac.nz/~ml/weka/book.html</url>
+	<title>Data Mining: Practical Machine Learning Tools and Techniques</title>
+	<snippet>Data Mining: Practical Machine Learning Tools and Techniques (Second Edition) Morgan Kaufmann. June 2005. 525 pages. Paper. ISBN 0-12-088407-0. Comments ... What&apos;s it all about? 1.1 Data mining and machine learning ...</snippet>
+</document><document id="16">	<url>http://www.ccsu.edu/datamining</url>
+	<title>Data Mining @ CCSU</title>
+	<snippet>Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling.</snippet>
+</document><document id="17">	<url>http://searchcrm.techtarget.com/sDefinition/0,,sid11_gci211901,00.html</url>
+	<title>data mining - a Whatis.com definition - see also: data miner, data analysis</title>
+	<snippet>... whatis.com: searchCRM.com Definitions - data mining ... about the future (This area of data mining is known as predictive analytics.) Data mining techniques are used in ...</snippet>
+</document><document id="18">	<url>http://www.ccsu.edu/datamining/master.html</url>
+	<title>CCSU - Data Mining</title>
+	<snippet>Master of Science Degree. Accredited by the State of Connecticut Department of Higher Education. ... Details on how to apply to the Master of Science in data mining may be found here ... the Master of Science in Data Mining should download the revised Planned Program ...</snippet>
+</document><document id="19">	<url>http://www.statserv.com/datamining.html</url>
+	<title>St@tServ - About Data Mining</title>
+	<snippet>... What is Data Mining ? &quot; Data mining is the process of discovering meaningful new correlations, patterns ... Gartner Group). &quot; Data mining is the exploration and analysis, by automatic ...</snippet>
+</document><document id="20">	<url>http://www.data-mine.com/</url>
+	<title>Data Mining Technologies, Inc.</title>
+	<snippet>Provides software and consulting for data mining.</snippet>
+</document><document id="21">	<url>http://www.the-data-mine.com/bin/view/Misc/DataMiningBooksAndPapers</url>
+	<title>Data Mining - Data Mining Books And Papers</title>
+	<snippet>... Mastering Data Mining Michael J. A. Berry, Gordon S ... method=&quot;POST&quot; action=&quot;http://buybox.amazon.com/o/dt/assoc/handle-buy-box=0471331236&quot;&amp;gt; Data Mining Techniques Michael J ...</snippet>
+</document><document id="22">	<url>http://www.computerworld.com/databasetopics/businessintelligence/datamining</url>
+	<title>Computerworld Data Mining</title>
+	<snippet>This special topic page focuses on data mining software and business intelligence tools. ... Latest on Data Mining. Q&amp;A: CA&apos;s new CTO discusses development, recruiting ... View more on Data Mining. Data Mining Feature. Group files complaint against &apos;adware&apos; firm ...</snippet>
+</document><document id="23">	<url>http://datamining.typepad.com/data_mining/</url>
+	<title>Data Mining</title>
+	<snippet>Current Reading. On the Stack. January 29, 2006. The Strength of BlogAnalytics. A while back, I wrote about how dangerous trend mining over blogs could be in the wrong hands. ... Data Mining. About. Weblogs ... company providing non-trivial analytics over blog data - or any other data for that mater - has already solved this ...</snippet>
+</document><document id="24">	<url>http://www.wessex.ac.uk/conferences/2002/datamining02</url>
+	<title>DATA MINING 2002 - Post Conference Report</title>
+	<snippet>... Third International Conference on Data Mining Methods and Databases for Engineering, Finance and ... The third international conference on Data Mining took place recently in Bologna ...</snippet>
+</document><document id="25">	<url>http://www.thearling.com/text/dmwhite/dmwhite.htm</url>
+	<title>An Introduction to Data Mining</title>
+	<snippet>... Data mining, the extraction of hidden predictive information from large databases, is a ... important information in their data warehouses. Data mining tools predict future trends ...</snippet>
+</document><document id="26">	<url>http://www.spss.com/datamine</url>
+	<title>Data Mining Software, Data Mining Applications and Data Mining Solutions</title>
+	<snippet>Data Mining at SPSS. Your source for data mining software, data mining tools, data mining applications and data mining solutions ... Most analysts separate data mining software into two groups: data mining tools and data mining applications. Data mining tools provide ...</snippet>
+</document><document id="27">	<url>http://www.onlamp.com/pub/a/onlamp/2004/04/08/datamining_email.html</url>
+	<title>ONLamp.com: Data Mining Email</title>
+	<snippet>Robert Bernier demonstrates how to store data from emails into a database, where you can use data-mining techniques to analyze it. ... What is data mining anyway? Data mining is a class of database applications that look for hidden patterns in a group of data ...</snippet>
+</document><document id="28">	<url>http://www.aaai.org/AITopics/html/mining.html</url>
+	<title>Data Mining and Discovery</title>
+	<snippet>AI Topics provides basic, understandable information and helpful resources concerning artificial intelligence, with an emphasis on material available online. ... Data Mining and Discovery. (a subtopic of Machine Learning ... Data mining is an AI powered tool that can discover useful information within a database that can then be used ...</snippet>
+</document><document id="29">	<url>http://www.research.microsoft.com/dmx/</url>
+	<title>Data Management, Exploration and Mining- Home</title>
+	<snippet>The Data Management Exploration and Mining Group (DMX). ... break down with massive data sets. Therefore, we aim at exploiting data mining techniques, i.e ... Our research effort in data mining focuses on ensuring that traditional techniques ...</snippet>
+</document><document id="30">	<url>http://www.dmreview.com/</url>
+	<title>DMReview</title>
+	<snippet>An issues and solutions publication that focuses on data warehousing as well as client/server and object technology for the enterprise.</snippet>
+</document><document id="31">	<url>http://www.megaputer.com/</url>
+	<title>Megaputer Intelligence</title>
+	<snippet>Manufactures multi-strategy data mining and text mining software solutions.</snippet>
+</document><document id="32">	<url>http://databases.about.com/od/datamining</url>
+	<title>Data Mining and Data Warehousing</title>
+	<snippet>The Net&apos;s best collection of data mining and data warehousing links from your About.com guide. From data mining tutorials to data warehousing techniques, you&apos;ll find it all! ... Benefits of Outsourcing Data Warehouse and Data Mining. Many organizations are seeking ...</snippet>
+</document><document id="33">	<url>http://www.pcc.qub.ac.uk/tec/courses/datamining/stu_notes/dm_book_1.html</url>
+	<title>Data Mining Student Notes, QUB</title>
+	<snippet>Data Mining. An Introduction. Student Notes. Ruth Dilly. Parallel Computer Centre. Queens University Belfast. Version 2.0. December1995 ... 1 - Data mining. 1.1 - What is data mining? 1.2 - Data mining background. 1.2.1 - Inductive learning ...</snippet>
+</document><document id="34">	<url>http://itmanagement.webopedia.com/TERM/D/data_mining.html</url>
+	<title>data mining - Webopedia.com</title>
+	<snippet>Search for more IT management terms . . . data mining. A class of database applications that look for hidden patterns in a group of data that can be used to predict future behavior. ... For example, data mining software can help retail companies find customers with common interests ... that presents data in new ways. True data mining software doesn&apos;t just change the ...</snippet>
+</document><document id="35">	<url>http://www.twocrows.com/</url>
+	<title>Two Crows Corporation</title>
+	<snippet>Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use.</snippet>
+</document><document id="36">	<url>http://databases.about.com/library/weekly/aa100700a.htm</url>
+	<title>Data Mining: An Introduction</title>
+	<snippet>Data mining allows you to find the needles hidden in your haystacks of data. Learn how to use these advanced techniques to meet your business objectives. ... heard a good deal about data mining -- the database industry&apos;s latest buzzword ... of automated statistical analysis (or &quot;data mining&quot;) techniques, businesses are discovering new ...</snippet>
+</document><document id="37">	<url>http://www.kdnuggets.com/software</url>
+	<title>Software for Data Mining and Knowledge Discovery</title>
+	<snippet>This is a directory of general-purpose data mining software. To suggest an entry, email to . See also domain-specific data-mining solutions.</snippet>
+</document><document id="38">	<url>http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm</url>
+	<title>Data Mining: What is Data Mining?</title>
+	<snippet>Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works.</snippet>
+</document><document id="39">	<url>http://www.megaputer.com/products/pa/index.php3</url>
+	<title>Data Mining Software</title>
+	<snippet>Megaputer offers data mining, text mining, and web data mining software tools for e-commerce, database marketing, and CRM; seminars, training and consulting on data mining. Customer ... and versatile suite of advanced data mining tools. PolyAnalyst incorporates the latest ... discovery to analyze both structured and unstructured data. The PolyAnalyst platform offers ...</snippet>
+</document><document id="40">	<url>http://www.sims.berkeley.edu/~hearst/papers/acl99/acl99-tdm.html</url>
+	<title>Untangling Text Data Mining</title>
+	<snippet>... Untangling Text Data Mining. Marti A. Hearst. School of Information Management &amp;amp; Systems ... The possibilities for data mining from large text collections are virtually untapped ...</snippet>
+</document><document id="41">	<url>http://www.megaputer.com/dm/dm101.php3</url>
+	<title>What is Data Mining</title>
+	<snippet>Megaputer offers data mining, text mining, and web data mining software tools for e-commerce, database marketing, and CRM; seminars, training and consulting on data mining. Customer ... in order to make informed business decisions. Data mining automates the process of finding relationships and patterns in ... In these situations data mining is your only real option ...</snippet>
+</document><document id="42">	<url>http://www.ncbi.nih.gov/Tools</url>
+	<title>NCBI Tools for Bioinformatics Research</title>
+	<snippet>... Tools for Data Mining. PubMed. Entrez. BLAST. OMIM. Books ... results of analyses that have been done on the sequence data. The amount and type of information presented depend ...</snippet>
+</document><document id="43">	<url>http://www.computerworld.com/databasetopics/businessintelligence/story/0,10801,103726,00.html?source=x10</url>
+	<title>Explainer: Data mining - Computerworld</title>
+	<snippet>Often used for predictive modeling, data mining is a subset of business intelligence that can help organizations better understand relationships among variables. ... into usable shape, however, requires sophisticated data mining tools. The same technology that police ... retailers, are ideal candidates for data mining technology. Wal-Mart Stores Inc ...</snippet>
+</document><document id="44">	<url>http://www.dmbenchmarking.com/</url>
+	<title>Data Mining Benchmarking Association (DMBA)</title>
+	<snippet>Association of companies and organizations working to identify &quot;best in class&quot; data mining processes through benchmarking studies.</snippet>
+</document><document id="45">	<url>http://datamining.typepad.com/</url>
+	<title>Data Mining</title>
+	<snippet>Current Reading. On the Stack. January 30, 2006. Fact versus Opinion. Information overload overload is becoming a serious problem for me. ... Data Mining. About. Weblogs ... company providing non-trivial analytics over blog data - or any other data for that mater - has already solved this ...</snippet>
+</document><document id="46">	<url>http://www.wessex.ac.uk/conferences/2005/data05</url>
+	<title>DATA MINING 2005</title>
+	<snippet>... International Conference on Data Mining, Text Mining and their Business Applications ... Conference on Data Mining, Text Mining and their Business Applications (Data Mining ...</snippet>
+</document><document id="47">	<url>http://www.galaxy.gmu.edu/stats/syllabi/DMLIST.html</url>
+	<title>URL&apos;s for Data Mining</title>
+	<snippet>URL&apos;s for Data Mining. The following URL&apos;s are some links to a variety of Data Mining webpages. They are not in any particular order. Actually, they are in the order I discovered (mined) them.</snippet>
+</document><document id="48">	<url>http://www.pcai.com/web/ai_info/data_warehouse_mining.html</url>
+	<title>PC AI - Data Warehouse and Data Mining</title>
+	<snippet>... Data Mining. Overview: Data mining or knowledge discovery is becoming more important as more and ... To Distributed Computing. Data Warehouse and Data Mining Information on the Internet ...</snippet>
+</document><document id="49">	<url>http://www.gr-fx.com/graf-fx.htm</url>
+	<title>Data Mining</title>
+	<snippet>... databases with graphs and queries using a technique called Data Mining. It is also a quick way to ... learn how to use another data mining product. All you have to ...</snippet>
+</document><document id="50">	<url>http://www.dwinfocenter.org/</url>
+	<title>Data Warehousing Information Center</title>
+	<snippet>Provides information on tools and techniques to design, build, maintain, and retrieve information from a data warehouse.</snippet>
+</document><document id="51">	<url>http://www.siam.org/meetings/sdm02</url>
+	<title>SIAM International Conference on Data Mining</title>
+	<snippet>Co-Sponsored by AHPCRC and University of Illinois at Chicago ... The field of data mining draws upon extensive work in areas such as; statistics ... presentation of recent results in data mining, including; applications, algorithms, software, and ...</snippet>
+</document><document id="52">	<url>http://www.oclc.org/research/projects/mining/</url>
+	<title>Data mining [OCLC - Projects]</title>
+	<snippet>Describes the goals, methodology, and timing of the Data mining project. ... Data mining. DCMI Registry DSpace Harvesting Economics of Digital Preservation Electronic Theses and Dissertations ... this end, the OCLC Research Data-Mining Research Area will focus on ...</snippet>
+</document><document id="53">	<url>http://www.stat.rutgers.edu/~madigan/datamining</url>
+	<title>Data Mining</title>
+	<snippet>... DATA MINING SPECIAL TOPICS CLASS ... will be using a draft version of Principles of Data Mining , by Hand, Mannila, and Smyth (MIT Press, forthcoming), as ...</snippet>
+</document><document id="54">	<url>http://dmoz.org/Computers/Software/Databases/Data_Mining</url>
+	<title>Open Directory - Computers: Software: Databases: Data Mining</title>
+	<snippet>the entire directory only in Databases/Data_Mining. See also: ... About.com on Data Mining - About.com presents a collection of original feature articles, net ... room dedicated to data mining and data warehousing topics. The Data Mine - Launched ...</snippet>
+</document><document id="55">	<url>http://www.investorhome.com/mining.htm</url>
+	<title>Investor Home - Data Mining</title>
+	<snippet>... Data Mining. The rapid evolution of computer technology in the last few decades has provided ... and consequences of &quot;data mining.&quot; Data mining involves searching through databases for ...</snippet>
+</document><document id="56">	<url>http://www.sas.com/technologies/analytics/datamining</url>
+	<title>SAS | Data and Text Mining</title>
+	<snippet>... of information, the potential would be enormous. With data mining, the possibilities are endless ... almost upon its introduction, our data mining technology continues to receive rave ...</snippet>
+</document><document id="57">	<url>http://www.wessex.ac.uk/conferences/2003/datamining03</url>
+	<title>Data Mining 2003</title>
+	<snippet>... Data Mining 2003. Fourth International Conference on Data Mining Including Building Applications for CRM ...</snippet>
+</document><document id="58">	<url>http://datamining.itsc.uah.edu/</url>
+	<title>ITSC Data Mining Solutions Center</title>
+	<snippet>... The ITSC Data Mining Solutions Center is the focal point for data mining research, development and services at ...</snippet>
+</document><document id="59">	<url>http://www.webopedia.com/TERM/D/data_mining.html</url>
+	<title>What is data mining? - A Word Definition From the Webopedia Computer Dictionary</title>
+	<snippet>This page describes the term data mining and lists other pages on the Web where you can find additional information. ... For example, data mining software can help retail companies find customers with common interests ... that presents data in new ways. True data mining software doesn&apos;t just change the ...</snippet>
+</document><document id="60">	<url>http://research.microsoft.com/dmx/DataMining/default.aspx</url>
+	<title>Data Mining Project</title>
+	<snippet>Search: All Research OnlineAll Microsoft.com. Data Mining: Efficient Data Exploration and Modeling. Overview. Goal ... will focus on exploiting data mining for advanced data summarization and also enable tighter ... database querying and data mining. Scalable Data Mining Algorithms: We are exploring ...</snippet>
+</document><document id="61">	<url>http://www.fas.org/sgp/crs/intel/RL31798.pdf</url>
+	<title>Data Mining: An Overview</title>
+	<snippet>... assessing risk, and product. retailing, data mining involves the use of data analysis tools to discover ... homeland security, data mining is often viewed as a potential means to ...</snippet>
+</document><document id="62">	<url>http://www.statsoftinc.com/</url>
+	<title>Data Mining, Statistical Analysis, Quality Control - STATISTICA Software</title>
+	<snippet>Statsoft is the creator of STATISTICA, the most comprehensive suite of data mining and statistical analysis software. ... StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks ... more information on STATISTICA, data mining, data analysis, statistical analysis &amp;amp; enterprise ...</snippet>
+</document><document id="63">	<url>http://www.insightful.com/</url>
+	<title>Insightful Corporation</title>
+	<snippet>The developer of the technical calculation application Mathcad, as well as developer and provider of a variety of other software tools for users of PCs, Macintosh computers, and UNIX workstations.</snippet>
+</document><document id="64">	<url>http://www.ncdm.uic.edu/</url>
+	<title>National Center for Data Mining (NCDM) - University of Illinois at Chicago</title>
+	<snippet>Conducts research in: scaling algorithms, applications and systems to massive data sets, developing algorithms, applications, and systems for mining distributed data, and establishing standard languages, protocols, and services for data mining and predictive modeling.</snippet>
+</document><document id="65">	<url>http://www.computerworld.com/hardwaretopics/hardware/desktops/story/0,10801,43509,00.html</url>
+	<title>Data Mining - Computerworld</title>
+	<snippet>Data mining is a process that finds relationships and patterns within a large amount of data stored in a database. The process uses tools based on algorithms to sift through mounds of data to find relationships. ... What has data mining done for Dick&apos;s Supermarkets ... What&apos;s the basis of a good data mining program? You have to establish the integrity of your data because that&apos;s ...</snippet>
+</document><document id="66">	<url>http://www.the-data-mine.com/bin/view/Software/WebHome</url>
+	<title>Data Mining - Web Home (Software)</title>
+	<snippet>... To find Data Mining Software, check the Web Index, use Web Search or check the most recent changes (Web Changes ... Misc. General Data Mining Information - Introductions, Tutorials etc ...</snippet>
+</document><document id="67">	<url>http://www.rulequest.com/</url>
+	<title>Rulequest Research</title>
+	<snippet>Provides software tools for data mining and knowledge discovery in databases.</snippet>
+</document><document id="68">	<url>http://www.bos.frb.org/economic/nerr/rr2000/q3/mining.htm</url>
+	<title>Regional Review: Mining Data</title>
+	<snippet>Mining Data. Quarter 3, 2000. by Miriam Wasserman. SCENE 1: It&apos;s late November 1999. The Celtics are struggling with their second lineup. ... They both include the use of data-mining computer technology to search for patterns in data ... player&apos;s potential is maximized. Although data mining by itself is not going to get ...</snippet>
+</document><document id="69">	<url>http://www.cisl.ucar.edu/hps/GROUPS/dm/dm.html</url>
+	<title>Data Mining Resources</title>
+	<snippet>... and Zantige, D. Data Mining, Harlow, UK: Addison-Wesley, 1996. Berry, M.J.A. and Linoff, G., Data Mining Techniques for Marketing, Sales, and Customer Support, New York, NY: John ...</snippet>
+</document><document id="70">	<url>http://www.wessex.ac.uk/conferences/2004/datamining04</url>
+	<title>DATA MINING 2004</title>
+	<snippet>... Fifth International Conference on Data Mining, Text Mining and their Business Applications ... 5th International Conference on Data Mining, Text Mining and their Business Applications ...</snippet>
+</document><document id="71">	<url>http://www.amazon.com/exec/obidos/tg/detail/-/1558605525?v=glance</url>
+	<title>Amazon.com: Data Mining: Practical Machine Learning Tools and Techniques with Java Implementations (The Morgan ... </title>
+	<snippet>... Topics covered: Data mining and machine learning basics, sample datasets and applications for data mining ... in the synthesis of data mining, data analysis, information theory and ...</snippet>
+</document><document id="72">	<url>http://www.sas.com/technologies/analytics/datamining/miner</url>
+	<title>SAS | SAS Enterprise Miner</title>
+	<snippet>... Miner streamlines the entire data mining process from data access to model deployment by ... It provides a powerful, complete data mining solution with unparalleled model development ...</snippet>
+</document><document id="73">	<url>http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome</url>
+	<title>MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining, Spring 2003 | Home</title>
+	<snippet>... marts specifically intended for management decision support. Data mining is a rapidly growing field that is ... The field of data mining has evolved from the disciplines of statistics ...</snippet>
+</document><document id="74">	<url>http://www.data-mining-guide.net/</url>
+	<title>Data Mining Software | Guide to Data Mining Software &amp; Concepts</title>
+	<snippet>What is Data Mining? Data Mining is the process of analyzing large data sets in order to find patterns that can help to isolate key variables to build predictive models for management decision making. ... In essence, data mining helps businesses to optimize their processes so that ...</snippet>
+</document><document id="75">	<url>http://www.cse.ohio-state.edu/~srini/694Z</url>
+	<title>CIS 694Z: Introduction to Data Mining</title>
+	<snippet>... discovery process, key data mining techniques, efficient high performance mining algorithms, exposure to applications of data mining (bioinformatics and intrusion detection ...</snippet>
+</document><document id="76">	<url>http://www.gao.gov/new.items/d05866.pdf</url>
+	<title>GAO-05-866 Data Mining: Agencies Have Taken Key Steps to Protect Privacy in Selected Efforts, but Significant ... </title>
+	<snippet>... The five data mining efforts we reviewed are used by federal agencies to ... individual privacy rights are being appropriately protected. Data mining—a technique for ...</snippet>
+</document><document id="77">	<url>http://datamining.typepad.com/data_mining/2005/08/rumour_mull.html</url>
+	<title>Data Mining: Rumour Mull</title>
+	<snippet>... Data Mining. About. Weblogs ... for 2005-08-15 from Emergence Marketing. Data Mining: Rumour Mull Interesting analysis of the Technorati takeover rumour ...</snippet>
+</document><document id="78">	<url>http://www.crm2day.com/data_mining</url>
+	<title>CRM Today - Data Mining &amp; Data Warehousing</title>
+	<snippet>... Abstract: The field of data mining, like statistics, concerns itself with ... at the connection between data mining and statistics, and ask ourselves whether data mining is &quot;statistical ...</snippet>
+</document><document id="79">	<url>http://www.kdnuggets.com/meetings</url>
+	<title>Meetings and Conferences in Data Mining and Knowledge Discovery</title>
+	<snippet>Meetings and Conferences in Data Mining, Knowledge Discovery, Genomic Mining, and Web Mining. March 7: Proposals due for. March 7: Proposals due for. 23-24 Oct, M2006, SAS 9th annual Data Mining Technology Conference, Las Vegas, NV, USA. ... with The second workshop on Algorithmic Techniques for Data Mining 2006 (ATDM 2006 ...</snippet>
+</document><document id="80">	<url>http://www.siam.org/meetings/sdm01</url>
+	<title>First SIAM International Conference on Data Mining</title>
+	<snippet>Registration. is Closed. Advances in information technology and data collection methods have led to the availability of large data sets in commercial enterprises and in a wide variety of scientific and engineering disciplines. ... The field of data mining draws upon extensive work in areas such as statistics ... presentation of recent results in data mining, including applications, algorithms, software, and ...</snippet>
+</document><document id="81">	<url>http://crm.ittoolbox.com/topics/t.asp?t=520&amp;p=520&amp;h1=520</url>
+	<title>CRM Analytical Data Mining</title>
+	<snippet>... Quality&apos; Model (Line56)- Learning from the past; data mining and Service Quality provide roadmaps, but CRM ... trade-off analysis. Data Mining in Depth: Data Mining and Privacy (DM ...</snippet>
+</document><document id="82">	<url>http://www.statoo.com/sections/Datamining/</url>
+	<title>Statoo Consulting, Statistical Consulting + Data Analysis + Data Mining Services, Lausanne, Switzerland</title>
+	<snippet>Statoo Consulting is a vendor independent Swiss consulting firm specialized in statistical consulting and training, data analysis, data mining, analytical CRM and bioinformatics services. ... Statistical Consulting + Data Analysis + Data Mining Services. Lausanne, Switzerland. Séminaire de méthodologie en data mining statistique, 6-8 Mars, 2006, Paris, France ...</snippet>
+</document><document id="83">	<url>http://www.cio.com/research/data/</url>
+	<title>Knowledge Management - Data Storage &amp; Mining - Warehouse, OLAP, glossary resources - Knowledge Management RC - CIO</title>
+	<snippet>CIO Data Storage &amp;amp; Mining Research Center is a compilation of articles, case studies, organizations, conferences, glossary of terms, and white papers related to data storage, mining/OLAP, and data warehousing.</snippet>
+</document><document id="84">	<url>http://www.thearling.com/dmintro/dmintro.htm</url>
+	<title>An Introduction to Data Mining by Kurt Thearling</title>
+	<snippet>7-Mar-03: An Introduction to Data Mining</snippet>
+</document><document id="85">	<url>http://www.stayfreemagazine.org/archives/14/datamining.html</url>
+	<title>Data Mining</title>
+	<snippet>... is arguably at the cutting edge of &quot;data mining&quot;: a new kind of information analysis that ... positively timid by comparison. Data mining uses artificial intelligence software to hunt ...</snippet>
+</document><document id="86">	<url>http://www.siam.org/meetings/sdm05</url>
+	<title>SIAM 2005 Data Mining Conference</title>
+	<snippet>... The field of data mining draws upon extensive work in areas ... and high-performance data mining. Distributed data mining. Scalable algorithms. Integration: mining, warehousing and OLAP ...</snippet>
+</document><document id="87">	<url>http://www.jcp.org/en/jsr/detail?id=73</url>
+	<title>The Java Community Process(SM) Program - JSRs: Java Specification Requests - detail JSR# 73</title>
+	<snippet>... and maintain data and metadata supporting data mining models, data scoring, and data mining results serving J2EE ... agreed upon, standard API for data mining. By using JDMAPI ...</snippet>
+</document><document id="88">	<url>http://www.megaputer.com/dm/index.php3</url>
+	<title>Data Mining Introduction</title>
+	<snippet>Megaputer offers data mining, text mining, and web data mining software tools for e-commerce, database marketing, and CRM; seminars, training and consulting on data mining. Customer ... Data Mining. What is data mining? PolyAnalyst Machine Learning Algorithms ... &quot;Data Mining is the process of identifying valid, novel, potentially useful, and ultimately comprehensible ...</snippet>
+</document><document id="89">	<url>http://www.healthcare-informatics.com/issues/2004/04_04/hagland.htm</url>
+	<title>Healthcare Informatics: Data Mining</title>
+	<snippet>... Data Mining. Stronger computer tools allow deeper analysis of medical research, patient care and ... well the tremendous potential of data mining--using software programs for pattern ...</snippet>
+</document><document id="90">	<url>http://www.dmreview.com/article_sub.cfm?articleId=1010449</url>
+	<title>Volume Analytics: Duo-Mining: Combining Data and Text Mining</title>
+	<snippet>... As standalone capabilities, the pattern-finding technologies of data mining and text mining have been around for years ... of all, what are data mining and text mining? They are similar ...</snippet>
+</document><document id="91">	<url>http://www.itworld.com/App/110/050805datamining</url>
+	<title>ITworld.com - Data mining</title>
+	<snippet>... it into usable shape, however, requires sophisticated data mining tools. The same technology that police departments ... How does data mining work? Data mining is a subset of business ...</snippet>
+</document><document id="92">	<url>http://www.statsoft.com/textbook/glosd.html</url>
+	<title>daniell (or equal weight) window. in time series, the daniell ...</title>
+	<snippet>... Data Mining. StatSoft defines data mining as an analytic process designed to ... information, see Data Mining. Data Preparation Phase. In Data Mining, the input data are often &quot;noisy ...</snippet>
+</document><document id="93">	<url>http://oracle.ittoolbox.com/topics/t.asp?t=427&amp;p=427&amp;h1=427</url>
+	<title>Oracle Business Intelligence Data Mining</title>
+	<snippet>... Sub-topic definition: Data Mining is a method of searching data with mathematical algorithms to identify ... the product evaluation process for Data Mining software. Oracle-BI-l - The ...</snippet>
+</document><document id="94">	<url>http://www.time.com/time/globalbusiness/article/0,9171,1101021223-400017,00.html?cnn=yes</url>
+	<title>TIME.com: Data Miners -- Dec. 23, 2002 -- Page 1</title>
+	<snippet>New software instantly connects key bits of data that once eluded teams of researchers ... The data-mining algorithms of ClearForest, based in New York City, are at work within both ... And these days, data-mining software, combined with technologies that connect disparate ...</snippet>
+</document><document id="95">	<url>http://www.sqlserverdatamining.com/</url>
+	<title>SQL Server Data Mining</title>
+	<snippet>sql server | data mining. Happy Birthday to SQLServerDataMining.com! ... .com with the mission to let the world know about the data mining functionality in SQL Server and help them use it ...</snippet>
+</document><document id="96">	<url>http://www.kdd.org/</url>
+	<title>Knowledge Discovery and Data Mining Foundation</title>
+	<snippet>Have you heard about ACM SIGKDD, the newly formed society for knowledge discovery and data mining? Click here to see the brand new ACM SIGKDD web page. KnowledgeDiscovery &amp;amp; Data Mining ... starting point for exploring Internet resources in knowledge discovery and data mining ...</snippet>
+</document><document id="97">	<url>http://www.knightsbridge.com/solutions/client/professional/requirements/mining.php</url>
+	<title>Data Mining</title>
+	<snippet>... Data mining is a powerful data warehousing technology to assist users with the abundance ... that they have collected. Data mining uses sophisticated statistical analyses and modeling ...</snippet>
+</document><document id="98">	<url>http://www.comp.nus.edu.sg/~dm2</url>
+	<title>DM II - Data Mining II</title>
+	<snippet>The DM-II system has two downloadable tools: CBA (v2.1) and IAS. CBA (v2.1) (Last Modify June, 25, 2001) is a data mining tool developed at School of Computing, National University of Singapore. ... Integrating Classification and Association Rule Mining&quot; (KDD-98). Further improvements were made ...</snippet>
+</document><document id="99">	<url>http://www.thearling.com/text/dmviz/modelviz.htm</url>
+	<title>Visualizing Data Mining Models</title>
+	<snippet>... Visualizing Data Mining Models. by Kurt Thearling, Barry Becker, Dennis DeCoste, Bill Mawby ... is going on. Since data mining usually involves extracting &quot;hidden&quot; information from ...</snippet>
+</document></searchresult>
\ No newline at end of file