You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/05/16 16:21:42 UTC
svn commit: r1103746 - in /lucene/dev/branches/branch_3x: dev-tools/maven/ solr/ solr/contrib/clustering/ solr/contrib/clustering/lib/ solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/ solr/contrib/clustering/src/test/ja...

Author: stanislaw
Date: Mon May 16 14:21:41 2011
New Revision: 1103746

URL: http://svn.apache.org/viewvc?rev=1103746&view=rev
Log:
SOLR-2448: Upgrade of Carrot2 to version 3.5.0 and a number of related clustering improvements (SOLR-2449, SOLR-2450, SOLR-2505)

Added:
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.5.0.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.4-jdk15.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt
Removed:
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.2.jar
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
Modified:
    lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/build.xml
    lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
    lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml

Modified: lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template (original)
+++ lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template Mon May 16 14:21:41 2011
@@ -106,14 +106,6 @@
   </licenses>
   <repositories>
     <repository>
-      <id>carrot2.org</id>
-      <name>Carrot2 Maven2 repository</name>
-      <url>http://download.carrot2.org/maven2/</url>
-      <snapshots>
-        <updatePolicy>never</updatePolicy>
-      </snapshots>
-    </repository>
-    <repository>
       <id>apache.snapshots</id>
       <name>Apache Snapshot Repository</name>
       <url>http://repository.apache.org/snapshots</url>
@@ -737,7 +729,7 @@
                   <goal>install-file</goal>
                 </goals>
                 <configuration>
-                  <file>solr/contrib/clustering/lib/carrot2-core-3.4.2.jar</file>
+                  <file>solr/contrib/clustering/lib/carrot2-core-3.5.0.jar</file>
                   <pomFile>lucene/build/lucene-solr-grandparent/solr-carrot2-core-pom.xml.template</pomFile>
                 </configuration>  
               </execution>

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Mon May 16 14:21:41 2011
@@ -29,7 +29,7 @@ Versions of Major Components
 ---------------------
 Apache Lucene 3x
 Apache Tika 0.8
-Carrot2 3.4.2
+Carrot2 3.5.0
 
 
 Upgrading from Solr 3.1

Modified: lucene/dev/branches/branch_3x/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/build.xml?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/build.xml (original)
+++ lucene/dev/branches/branch_3x/solr/build.xml Mon May 16 14:21:41 2011
@@ -1034,7 +1034,7 @@
                                    jar.file="contrib/uima/lib/uima-an-wst-2.3.1-SNAPSHOT-r1076132.jar" />
 
       <m2-deploy-with-pom-template pom.xml="contrib/clustering/lib/solr-carrot2-core-pom.xml.template"
-                                   jar.file="contrib/clustering/lib/carrot2-core-3.4.2.jar" />
+                                   jar.file="contrib/clustering/lib/carrot2-core-3.5.0.jar" />
 
       <!-- ========== SOLR ARTIFACTS ========== -->
 

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt Mon May 16 14:21:41 2011
@@ -10,8 +10,11 @@ $Id$
 
 ================== Release 3.2.0-dev ==================
 
-(No Changes)
-
+* SOLR-2448: Search results clustering updates: bisecting k-means 
+  clustering algorithm added, loading of Carrot2 stop words from 
+  <solr.home>/conf/clustering/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505).  
+  
 ================== Release 3.1.0-dev ==================
 
 * SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.5.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.5.0.jar?rev=1103746&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.4-jdk15.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.4-jdk15.jar?rev=1103746&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template Mon May 16 14:21:41 2011
@@ -26,10 +26,10 @@
   <packaging>jar</packaging>
   <name>Solr Specific Carrot2</name>
   <description>
-    Carrot2 search results clustering framework core, document
-    sources and clustering algorithms.
+    Carrot2 search results clustering framework core
+    and clustering algorithms.
 
-    Compiled with Java 1.5 from carrot2-core v3.4.2 sources.
+    Compiled with Java 1.5 from carrot2-core v3.5.0 sources.
   </description>
   <licenses>
     <license>
@@ -59,196 +59,78 @@
     <url>http://issues.carrot2.org/</url>
   </issueManagement>
 
-  <repositories>
-      <repository>
-          <id>carrotsearch.labs.releases</id>
-          <name>Carrot Search Labs Releases Repository</name>
-          <url>http://repository.carrotsearch.com/labs/releases</url>
-      </repository>
-  </repositories>
+  <properties>
+    <jackson.version>1.7.4</jackson.version>
+    <slf4j.version>1.6.1</slf4j.version>
+  </properties>
 
   <dependencies>
     <dependency>
-      <groupId>org.carrot2</groupId>
-      <artifactId>rome</artifactId>
-      <version>1.0.RC1</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.carrot2</groupId>
-      <artifactId>rome-fetcher</artifactId>
-      <version>0.7</version>
-    </dependency>
-
-    <dependency>
       <groupId>org.codehaus.jackson</groupId>
       <artifactId>jackson-core-asl</artifactId>
-      <version>1.5.2</version>
+      <version>${jackson.version}</version>
     </dependency>
 
     <dependency>
       <groupId>org.codehaus.jackson</groupId>
       <artifactId>jackson-mapper-asl</artifactId>
-      <version>1.5.2</version>
+      <version>${jackson.version}</version>
     </dependency>
-        
+
     <dependency>
       <groupId>net.sf.ehcache</groupId>
       <artifactId>ehcache-core</artifactId>
-      <version>1.7.1</version>
-    </dependency>
-
-    <dependency>
-      <groupId>commons-codec</groupId>
-      <artifactId>commons-codec</artifactId>
-      <version>1.3</version>
-    </dependency>
-
-    <dependency>
-      <groupId>commons-collections</groupId>
-      <artifactId>commons-collections</artifactId>
-      <version>3.2.1</version>
-    </dependency>
-
-    <dependency>
-      <groupId>commons-discovery</groupId>
-      <artifactId>commons-discovery</artifactId>
-      <version>0.2</version>
-    </dependency>
-
-    <dependency>
-      <groupId>commons-httpclient</groupId>
-      <artifactId>commons-httpclient</artifactId>
-      <version>3.1</version>
-    </dependency>
-
-    <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <version>1.4</version>
+      <version>1.7.2</version>
     </dependency>
 
     <dependency>
       <groupId>commons-lang</groupId>
       <artifactId>commons-lang</artifactId>
-      <version>2.4</version>
-    </dependency>
-
-    <dependency>
-      <groupId>commons-logging</groupId>
-      <artifactId>commons-logging</artifactId>
-      <version>1.1.1</version>
+      <version>2.6</version>
     </dependency>
 
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
-      <version>r05</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.jdom</groupId>
-      <artifactId>jdom</artifactId>
-      <version>1.1</version>
+      <version>r08</version>
     </dependency>
 
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
-      <version>1.5.8</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-core</artifactId>
-      <version>3.0.1</version>
-    </dependency>
-        
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-analyzers</artifactId>
-      <version>3.0.1</version>
-    </dependency>
-        
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-highlighter</artifactId>
-      <version>3.0.1</version>
-    </dependency>
-        
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-snowball</artifactId>
-      <version>3.0.1</version>
-    </dependency>
-        
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-smartcn</artifactId>
-      <version>3.0.1</version>
-      <optional>true</optional>
-    </dependency>
-        
-    <dependency>
-      <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-memory</artifactId>
-      <version>3.0.1</version>
-    </dependency>
-
-    <dependency>
-      <groupId>oro</groupId>
-      <artifactId>oro</artifactId>
-      <version>2.0.8</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.simpleframework</groupId>
-      <artifactId>simple-xml</artifactId>
-      <version>2.3.5</version>
-    </dependency>
-
-    <dependency>
-      <groupId>org.codehaus.woodstox</groupId>
-      <artifactId>wstx-asl</artifactId>
-      <version>4.0.0</version>
-    </dependency> 
-
-    <dependency>
-      <groupId>xalan</groupId>
-      <artifactId>xalan</artifactId>
-      <version>2.7.0</version>
-    </dependency>
-
-    <dependency>
-      <groupId>xerces</groupId>
-      <artifactId>xercesImpl</artifactId>
-      <version>2.8.1</version>
+      <version>${slf4j.version}</version>
     </dependency>
 
     <dependency>
       <groupId>org.apache.mahout</groupId>
       <artifactId>mahout-math</artifactId>
       <version>0.3</version>
+      <exclusions>
+        <exclusion>
+          <groupId>com.google.code.gson</groupId>
+          <artifactId>gson</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
 
     <dependency>
       <groupId>org.apache.mahout</groupId>
       <artifactId>mahout-collections</artifactId>
       <version>0.3</version>
-    </dependency>
-
-    <dependency>
-      <groupId>com.carrotsearch</groupId>
-      <artifactId>nni</artifactId>
-      <version>1.0.0</version>
-      <optional>true</optional>
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.mahout</groupId>
+          <artifactId>mahout-collection-codegen-plugin</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
 
     <dependency>
       <groupId>com.carrotsearch</groupId>
       <artifactId>hppc</artifactId>
-      <version>0.3.1</version>
-    </dependency>    
+      <version>0.3.4</version>
+      <classifier>jdk15</classifier>
+    </dependency>
   </dependencies>
 </project>
 

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon May 16 14:21:41 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
  */
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
 import org.apache.solr.handler.clustering.SearchClusteringEngine;
 import org.apache.solr.highlight.SolrHighlighter;
 import org.apache.solr.request.LocalSolrQueryRequest;
@@ -51,9 +54,17 @@ import org.carrot2.core.ControllerFactor
 import org.carrot2.core.Document;
 import org.carrot2.core.IClusteringAlgorithm;
 import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
 /**
@@ -63,19 +74,33 @@ import com.google.common.collect.Sets;
  *
  * @link http://project.carrot2.org
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngine extends SearchClusteringEngine {
   private transient static Logger log = LoggerFactory
           .getLogger(CarrotClusteringEngine.class);
+  
+  /**
+   * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+   */
+  private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+   
+  /**
+   * Name of Carrot2 document's field containing Solr document's identifier.
+   */
+  private static final String SOLR_DOCUMENT_ID = "solrId";
+
+  /**
+   * Name of Solr document's field containing the document's identifier. To avoid
+   * repeating the content of documents in clusters on output, each cluster contains
+   * identifiers of documents it contains.
+   */
+  private String idFieldName;
 
   /**
    * Carrot2 controller that manages instances of clustering algorithms
    */
   private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
-
-  private String idFieldName;
-
+  
   @Override
   @Deprecated
   public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -100,6 +125,10 @@ public class CarrotClusteringEngine exte
       attributes.put(AttributeNames.DOCUMENTS, documents);
       attributes.put(AttributeNames.QUERY, query.toString());
 
+      // Pass the fields on which clustering runs to the
+      // SolrStopwordsCarrot2LexicalDataFactory
+      attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
       // Pass extra overriding attributes from the request, if any
       extractCarrotAttributes(sreq.getParams(), attributes);
 
@@ -113,21 +142,66 @@ public class CarrotClusteringEngine exte
   }
 
   @Override
+	@SuppressWarnings({ "unchecked", "rawtypes" })
   public String init(NamedList config, final SolrCore core) {
     String result = super.init(config, core);
-    SolrParams initParams = SolrParams.toSolrParams(config);
+    final SolrParams initParams = SolrParams.toSolrParams(config);
 
     // Initialize Carrot2 controller. Pass initialization attributes, if any.
     HashMap<String, Object> initAttributes = new HashMap<String, Object>();
     extractCarrotAttributes(initParams, initAttributes);
     
-    // Customize the language model factory. The implementation we provide here
-    // is included in the code base of Solr, so that it's possible to refactor
-    // the Lucene APIs the factory relies on if needed.
-    initAttributes.put("PreprocessingPipeline.languageModelFactory",
-      LuceneLanguageModelFactory.class);
-    this.controller.init(initAttributes);
+    // Customize the stemmer and tokenizer factories. The implementations we provide here
+    // are included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factories rely on if needed.
+    // Additionally, we set a custom lexical resource factory for Carrot2 that 
+    // will use both the Carrot2 default stop words and the stop words from 
+    // the StopFilter defined on the field.
+    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+        .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+        .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+        .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+    
+    // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+    initAttributes.put("solrIndexSchema", core.getSchema());
 
+    // Customize Carrot2's resource lookup to first look for resources
+    // using Solr's resource loader. If that fails, try loading from the classpath.
+    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+        .resourceLookup(new ResourceLookup(new IResourceLocator() {
+          public IResource[] getAll(final String resource) {
+            final SolrResourceLoader resourceLoader = core.getResourceLoader();
+            final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+                + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+            try {
+              log.debug("Looking for " + resource + " in "
+                  + carrot2ResourcesDir);
+              final InputStream resourceStream = resourceLoader
+                  .openResource(carrot2ResourcesDir + "/" + resource);
+              
+              log.info(resource + " loaded from " + carrot2ResourcesDir);
+              final IResource foundResource = new IResource() {
+                public InputStream open() throws IOException {
+                  return resourceStream;
+                }
+              };
+              return new IResource[] { foundResource };
+            } catch (RuntimeException e) {
+              // No way to distinguish if the resource was found but failed
+              // to load or wasn't found at all, so we simply fall back
+              // to Carrot2 defaults here by returning an empty locations array.
+              log.debug(resource + " not found in " + carrot2ResourcesDir
+                  + ". Using the default " + resource + " from Carrot JAR.");
+              return new IResource[] {};
+            }
+
+          }
+        },
+        
+        // Using the class loader directly because this time we want to omit the prefix 
+        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+    
+    this.controller.init(initAttributes);
     this.idFieldName = core.getSchema().getUniqueKeyField().getName();
 
     // Make sure the requested Carrot2 clustering algorithm class is available
@@ -147,20 +221,33 @@ public class CarrotClusteringEngine exte
   protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
     SolrParams solrParams = sreq.getParams();
 
-    // Names of fields to deliver content for clustering
-    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+    HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+    fields.add(idFieldName);
+    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+		return fields;
+  }
+
+	/**
+	 * Returns the names of fields that will be delivering the actual
+	 * content for clustering. Currently, there are two such fields: document
+	 * title and document content.
+	 */
+	private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+    SolrParams solrParams = sreq.getParams();
+
     String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
     String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
     if (StringUtils.isBlank(snippetField)) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
               + " must not be blank.");
     }
-    return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
-  }
+    return Sets.newHashSet(titleField, snippetField);
+	}
   
   /**
    * Prepares Carrot2 documents for clustering.
    */
+  @SuppressWarnings("deprecation")
   private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds,
                                       Query query, final SolrQueryRequest sreq) throws IOException {
     SolrHighlighter highlighter = null;
@@ -179,7 +266,7 @@ public class CarrotClusteringEngine exte
     if (produceSummary == true) {
       highlighter = core.getHighlighter();
       if (highlighter != null){
-        Map args = new HashMap();
+        Map<String, Object> args = Maps.newHashMap();
         snippetFieldAry = new String[]{snippetField};
         args.put(HighlightParams.FIELDS, snippetFieldAry);
         args.put(HighlightParams.HIGHLIGHT, "true");
@@ -213,11 +300,12 @@ public class CarrotClusteringEngine exte
       if (produceSummary && docIds != null) {
         docsHolder[0] = docIds.get(sdoc).intValue();
         DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
-        NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
         if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
           //should only be one document with one field
-          NamedList tmp = (NamedList) highlights.getVal(0);
-          String [] highlt = (String[]) tmp.get(snippetField);
+          @SuppressWarnings("unchecked")
+					NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          String [] highlt = tmp.get(snippetField);
           if (highlt != null && highlt.length == 1) {
             snippet = highlt[0];
           }
@@ -225,7 +313,7 @@ public class CarrotClusteringEngine exte
       }
       Document carrotDocument = new Document(getValue(sdoc, titleField),
               snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
       result.add(carrotDocument);
     }
 
@@ -260,9 +348,9 @@ public class CarrotClusteringEngine exte
     return result.toString().trim();
   }
 
-  private List clustersToNamedList(List<Cluster> carrotClusters,
+  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
                                    SolrParams solrParams) {
-    List result = new ArrayList();
+    List<NamedList<Object>> result = Lists.newArrayList();
     clustersToNamedList(carrotClusters, result, solrParams.getBool(
             CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
             CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -270,25 +358,40 @@ public class CarrotClusteringEngine exte
   }
 
   private void clustersToNamedList(List<Cluster> outputClusters,
-                                   List parent, boolean outputSubClusters, int maxLabels) {
+                                   List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
     for (Cluster outCluster : outputClusters) {
-      NamedList cluster = new SimpleOrderedMap();
+      NamedList<Object> cluster = new SimpleOrderedMap<Object>();
       parent.add(cluster);
 
+      // Add labels
       List<String> labels = outCluster.getPhrases();
-      if (labels.size() > maxLabels)
+      if (labels.size() > maxLabels) {
         labels = labels.subList(0, maxLabels);
+      }
       cluster.add("labels", labels);
 
+      // Add cluster score
+      final Double score = outCluster.getScore();
+      if (score != null) {
+        cluster.add("score", score);
+      }
+
+      // Add other topics marker
+      if (outCluster.isOtherTopics()) {
+        cluster.add("other-topics", outCluster.isOtherTopics());
+      }
+
+      // Add documents
       List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
-      List docList = new ArrayList();
+      List<Object> docList = Lists.newArrayList();
       cluster.add("docs", docList);
       for (Document doc : docs) {
-        docList.add(doc.getField("solrId"));
+        docList.add(doc.getField(SOLR_DOCUMENT_ID));
       }
 
-      if (outputSubClusters) {
-        List subclusters = new ArrayList();
+      // Add subclusters
+      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = Lists.newArrayList();
         cluster.add("clusters", subclusters);
         clustersToNamedList(outCluster.getSubclusters(), subclusters,
                 outputSubClusters, maxLabels);

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Mon May 16 14:21:41 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
   String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
 
+  String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
   public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
           ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
           PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java Mon May 16 14:21:41 2011
@@ -0,0 +1,239 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.CharBuffer;
+import java.util.HashMap;
+
+import org.apache.lucene.analysis.ar.ArabicNormalizer;
+import org.apache.lucene.analysis.ar.ArabicStemmer;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.linguistic.IStemmer;
+import org.carrot2.text.linguistic.IStemmerFactory;
+import org.carrot2.util.ReflectionUtils;
+import org.slf4j.Logger;
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.DanishStemmer;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.FinnishStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
+import org.tartarus.snowball.ext.GermanStemmer;
+import org.tartarus.snowball.ext.HungarianStemmer;
+import org.tartarus.snowball.ext.ItalianStemmer;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+import org.tartarus.snowball.ext.RomanianStemmer;
+import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SpanishStemmer;
+import org.tartarus.snowball.ext.SwedishStemmer;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
+ * APIs. Should the relevant Lucene APIs need to change, the changes can be made
+ * in this class.
+ */
+public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
+	final static Logger logger = org.slf4j.LoggerFactory
+			.getLogger(LuceneCarrot2StemmerFactory.class);
+
+	public IStemmer getStemmer(LanguageCode language) {
+		switch (language) {
+		case ARABIC:
+			return ArabicStemmerFactory.createStemmer();
+
+		case CHINESE_SIMPLIFIED:
+			return IdentityStemmer.INSTANCE;
+
+		default:
+			/*
+			 * For other languages, try to use snowball's stemming.
+			 */
+			return SnowballStemmerFactory.createStemmer(language);
+		}
+	}
+
+	/**
+	 * Factory of {@link IStemmer} implementations from the <code>snowball</code>
+	 * project.
+	 */
+	private final static class SnowballStemmerFactory {
+		/**
+		 * Static hard mapping from language codes to stemmer classes in Snowball.
+		 * This mapping is not dynamic because we want to keep the possibility to
+		 * obfuscate these classes.
+		 */
+		private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+		static {
+			snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+			snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
+			snowballStemmerClasses
+					.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+			snowballStemmerClasses
+					.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
+					PortugueseStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+			snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
+		}
+
+		/**
+		 * An adapter converting Snowball programs into {@link IStemmer} interface.
+		 */
+		private static class SnowballStemmerAdapter implements IStemmer {
+			private final SnowballProgram snowballStemmer;
+
+			public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+				this.snowballStemmer = snowballStemmer;
+			}
+
+			public CharSequence stem(CharSequence word) {
+				snowballStemmer.setCurrent(word.toString());
+				if (snowballStemmer.stem()) {
+					return snowballStemmer.getCurrent();
+				} else {
+					return null;
+				}
+			}
+		}
+
+		/**
+		 * Create and return an {@link IStemmer} adapter for a
+		 * {@link SnowballProgram} for a given language code. An identity stemmer is
+		 * returned for unknown languages.
+		 */
+		public static IStemmer createStemmer(LanguageCode language) {
+			final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
+					.get(language);
+
+			if (stemmerClazz == null) {
+				logger.warn("No Snowball stemmer class for: " + language.name()
+						+ ". Quality of clustering may be degraded.");
+				return IdentityStemmer.INSTANCE;
+			}
+
+			try {
+				return new SnowballStemmerAdapter(stemmerClazz.newInstance());
+			} catch (Exception e) {
+				logger.warn("Could not instantiate snowball stemmer"
+						+ " for language: " + language.name()
+						+ ". Quality of clustering may be degraded.", e);
+
+				return IdentityStemmer.INSTANCE;
+			}
+		}
+	}
+
+	/**
+	 * Factory of {@link IStemmer} implementations for the
+	 * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
+	 * to be present in classpath, otherwise an empty (identity) stemmer is
+	 * returned.
+	 */
+	private static class ArabicStemmerFactory {
+		static {
+			try {
+				// Look up by name: a class literal would itself throw NoClassDefFoundError.
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.ar.ArabicStemmer", false);
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.ar.ArabicNormalizer", false);
+			} catch (Throwable e) {
+				logger.warn("Could not instantiate Lucene stemmer for Arabic, clustering quality "
+						+ "of Arabic content may be degraded. For best quality clusters, "
+						+ "make sure Lucene's Arabic analyzer JAR is in the classpath", e);
+			}
+		}
+
+		/**
+		 * Adapter to lucene-contrib Arabic analyzers.
+		 */
+		private static class LuceneStemmerAdapter implements IStemmer {
+			private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+			private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+			private char[] buffer = new char[0];
+
+			private LuceneStemmerAdapter() throws Exception {
+				delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+				normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+			}
+
+			public CharSequence stem(CharSequence word) {
+				if (word.length() > buffer.length) {
+					buffer = new char[word.length()];
+				}
+
+				for (int i = 0; i < word.length(); i++) {
+					buffer[i] = word.charAt(i);
+				}
+
+				int newLen = normalizer.normalize(buffer, word.length());
+				newLen = delegate.stem(buffer, newLen);
+
+				if (newLen != word.length() || !equals(buffer, newLen, word)) {
+					return CharBuffer.wrap(buffer, 0, newLen);
+				}
+
+				// Same-same.
+				return null;
+			}
+
+			private boolean equals(char[] buffer, int len, CharSequence word) {
+				assert len == word.length();
+
+				for (int i = 0; i < len; i++) {
+					if (buffer[i] != word.charAt(i))
+						return false;
+				}
+
+				return true;
+			}
+		}
+
+		public static IStemmer createStemmer() {
+			try {
+				return new LuceneStemmerAdapter();
+			} catch (Throwable e) {
+				return IdentityStemmer.INSTANCE;
+			}
+		}
+	}
+
+	/**
+	 * An implementation of {@link IStemmer} that always returns <code>null</code>
+	 * which means no stemming.
+	 */
+	private static class IdentityStemmer implements IStemmer {
+		private final static IdentityStemmer INSTANCE = new IdentityStemmer();
+
+		public CharSequence stem(CharSequence word) {
+			return null;
+		}
+	}
+}

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java Mon May 16 14:21:41 2011
@@ -0,0 +1,155 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
+import org.carrot2.text.analysis.ITokenizer;
+import org.carrot2.text.linguistic.ITokenizerFactory;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.ExceptionUtils;
+import org.carrot2.util.ReflectionUtils;
+import org.slf4j.Logger;
+
+/**
+ * An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
+ * Smart Chinese tokenizer. If Smart Chinese tokenizer is not available in
+ * classpath at runtime, the default Carrot2's tokenizer is used. Should the
+ * Lucene APIs need to change, the changes can be made in this class.
+ */
+public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
+	final static Logger logger = org.slf4j.LoggerFactory
+			.getLogger(LuceneCarrot2TokenizerFactory.class);
+
+	public ITokenizer getTokenizer(LanguageCode language) {
+		switch (language) {
+		case CHINESE_SIMPLIFIED:
+			return ChineseTokenizerFactory.createTokenizer();
+
+			/*
+			 * We use our own analyzer for Arabic. Lucene's version has special
+			 * support for Nonspacing-Mark characters (see
+			 * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+			 * have them included as letters in the parser.
+			 */
+		case ARABIC:
+			// Intentional fall-through.
+
+		default:
+			return new ExtendedWhitespaceTokenizer();
+		}
+	}
+
+	/**
+	 * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+	 * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+	 * factory will fall back to the default white space tokenizer.
+	 */
+	private static final class ChineseTokenizerFactory {
+		static {
+			try {
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+			} catch (Throwable e) {
+				logger
+						.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+								+ "of Chinese content may be degraded. For best quality clusters, "
+								+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+			}
+		}
+
+		static ITokenizer createTokenizer() {
+			try {
+				return new ChineseTokenizer();
+			} catch (Throwable e) {
+				return new ExtendedWhitespaceTokenizer();
+			}
+		}
+
+		private final static class ChineseTokenizer implements ITokenizer {
+			private final static Pattern numeric = Pattern
+					.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+			private Tokenizer sentenceTokenizer;
+			private TokenStream wordTokenFilter;
+			private CharTermAttribute term = null;
+
+			private final MutableCharArray tempCharSequence;
+			private final Class<?> tokenFilterClass;
+
+			private ChineseTokenizer() throws Exception {
+				this.tempCharSequence = new MutableCharArray(new char[0]);
+
+				// As Smart Chinese is not available during compile time,
+				// we need to resort to reflection.
+				final Class<?> tokenizerClass = ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+				this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+						Reader.class).newInstance((Reader) null);
+				this.tokenFilterClass = ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+			}
+
+			public short nextToken() throws IOException {
+				final boolean hasNextToken = wordTokenFilter.incrementToken();
+				if (hasNextToken) {
+					short flags = 0;
+					final char[] image = term.buffer();
+					final int length = term.length();
+					tempCharSequence.reset(image, 0, length);
+					if (length == 1 && image[0] == ',') {
+						// ChineseTokenizer seems to convert all punctuation to ','
+						// characters
+						flags = ITokenizer.TT_PUNCTUATION;
+					} else if (numeric.matcher(tempCharSequence).matches()) {
+						flags = ITokenizer.TT_NUMERIC;
+					} else {
+						flags = ITokenizer.TT_TERM;
+					}
+					return flags;
+				}
+
+				return ITokenizer.TT_EOF;
+			}
+
+			public void setTermBuffer(MutableCharArray array) {
+				array.reset(term.buffer(), 0, term.length());
+			}
+
+			public void reset(Reader input) throws IOException {
+				try {
+					sentenceTokenizer.reset(input);
+					wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+							TokenStream.class).newInstance(sentenceTokenizer);
+					term = wordTokenFilter.addAttribute(CharTermAttribute.class);
+				} catch (Exception e) {
+					throw ExceptionUtils.wrapAsRuntimeException(e);
+				}
+			}
+		}
+	}
+}

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java Mon May 16 14:21:41 2011
@@ -0,0 +1,138 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.solr.analysis.CommonGramsFilterFactory;
+import org.apache.solr.analysis.StopFilterFactory;
+import org.apache.solr.analysis.TokenFilterFactory;
+import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.schema.IndexSchema;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.core.attribute.Init;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
+import org.carrot2.text.linguistic.ILexicalData;
+import org.carrot2.text.linguistic.ILexicalDataFactory;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.slf4j.Logger;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+/**
+ * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
+ * words from a field's StopFilter to the default stop words used in Carrot2,
+ * for all languages Carrot2 supports. Completely replacing Carrot2 stop words
+ * with Solr's wouldn't make much sense because clustering needs more aggressive
+ * stop words removal. In other words, if something is a stop word during
+ * indexing, then it should also be a stop word during clustering, but not the
+ * other way round.
+ */
+@Bindable
+public class SolrStopwordsCarrot2LexicalDataFactory implements
+		ILexicalDataFactory {
+	final static Logger logger = org.slf4j.LoggerFactory
+			.getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+	@Init
+	@Input
+	@Attribute(key = "solrIndexSchema")
+	private IndexSchema schema;
+
+	@Processing
+	@Input
+	@Attribute(key = "solrFieldNames")
+	private Set<String> fieldNames;
+
+	/**
+	 * A lazily-built cache of stop words per field.
+	 */
+	private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
+
+	/**
+	 * Carrot2's default lexical resources to use in addition to Solr's stop
+	 * words.
+	 */
+	private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
+
+	/**
+	 * Obtains stop words for a field from the associated
+	 * {@link StopFilterFactory}, if any.
+	 */
+	private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
+		// No need to synchronize here, Carrot2 ensures that instances
+		// of this class are not used by multiple threads at a time.
+		if (!solrStopWords.containsKey(fieldName)) {
+			final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
+					.getAnalyzer();
+			if (fieldAnalyzer instanceof TokenizerChain) {
+				final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
+						.getTokenFilterFactories();
+				for (TokenFilterFactory factory : filterFactories) {
+					if (factory instanceof StopFilterFactory) {
+						// StopFilterFactory holds the stop words in a CharArraySet, but
+						// the getStopWords() method returns a Set<?>, so we need to cast.
+						solrStopWords.put(fieldName,
+								(CharArraySet) ((StopFilterFactory) factory).getStopWords());
+					}
+
+					if (factory instanceof CommonGramsFilterFactory) {
+						solrStopWords.put(fieldName,
+								(CharArraySet) ((CommonGramsFilterFactory) factory)
+										.getCommonWords());
+					}
+				}
+			}
+		}
+		return solrStopWords.get(fieldName);
+	}
+
+	public ILexicalData getLexicalData(LanguageCode languageCode) {
+		final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
+				.getLexicalData(languageCode);
+
+		return new ILexicalData() {
+			public boolean isStopLabel(CharSequence word) {
+				// Nothing in Solr maps to the concept of a stop label,
+				// so return Carrot2's default here.
+				return carrot2LexicalData.isStopLabel(word);
+			}
+
+			public boolean isCommonWord(MutableCharArray word) {
+				// Loop over the fields involved in clustering first
+				for (String fieldName : fieldNames) {
+					for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
+						if (stopWords.contains(word)) {
+							return true;
+						}
+					}
+				}
+				// Check default Carrot2 stop words too
+				return carrot2LexicalData.isCommonWord(word);
+			}
+		};
+	}
+}

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon May 16 14:21:41 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
@@ -37,22 +42,18 @@ import org.apache.solr.util.SolrPluginUt
 import org.carrot2.util.attribute.AttributeUtils;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
 
 /**
  *
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
   @Test
   public void testCarrotLingo() throws Exception {
-  	// Note: the expected number of clusters may change after upgrading Carrot2
-  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+     // Note: the expected number of clusters may change after upgrading Carrot2
+     // due to e.g. internal improvements or tuning of Carrot2 clustering.
     final int expectedNumClusters = 10;
-		checkEngine(getClusteringEngine("default"), expectedNumClusters);
+         checkEngine(getClusteringEngine("default"), expectedNumClusters);
   }
 
   @Test
@@ -61,8 +62,8 @@ public class CarrotClusteringEngineTest 
     solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
     solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
     
-  	// Note: the expected number of clusters may change after upgrading Carrot2
-  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+     // Note: the expected number of clusters may change after upgrading Carrot2
+     // due to e.g. internal improvements or tuning of Carrot2 clustering.
     final int expectedNumClusters = 15;
     checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
   }
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest 
 
   @Test
   public void testWithoutSubclusters() throws Exception {
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
             1, 1, 0);
   }
 
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest 
   public void testWithSubclusters() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
   }
 
   @Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest 
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
     params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
   @Test
+  public void testClusterScores() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      final Double score = getScore(cluster);
+      assertNotNull(score);
+      assertEquals(0.25 * i++, score, 0);
+    }
+  }
+
+  @Test
+  public void testOtherTopics() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+    }
+  }
+
+  @Test
   public void testCarrotAttributePassing() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
+	@Test
+	public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+				"online,customsolrstopword,customsolrstoplabel");
+	}
+
+	@Test
+	public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+				"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+	}
+
+	private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+			throws IOException {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+				wordsToCheck);
+
+		// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+		// stoplabels.en, so we're expecting only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine(engineName), 1, params);
+		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+	}
+
+	@Test
+	public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		// "solrownstopword" is in stopwords.txt, so we're expecting
+		// only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 1, params);
+		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+	}
+
+	@Test
+	public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		// Force string fields to be used for clustering. Does not make sense
+		// in the real world, but does the job in the test.
+		params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+		params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 2, params);
+		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+		assertEquals(ImmutableList.of("solrownstopword"),
+				getLabels(clusters.get(1)));
+	}
+
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()
             .getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest 
     return engine;
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters, SolrParams clusteringParams) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
   }
 
 
-  private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
                            int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
     // Get all documents to cluster
     RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest 
       LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
       Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
       SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
-      List results = (List)engine.cluster(query, solrDocList, docIds, req);
+      
+      @SuppressWarnings("unchecked")
+             List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
       req.close();
       assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
       checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest 
     }
   }
 
-  private void checkClusters(List results, int expectedDocCount,
+  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
                              int expectedLabelCount, int expectedSubclusterCount) {
     for (int i = 0; i < results.size(); i++) {
-      NamedList cluster = (NamedList) results.get(i);
+      NamedList<Object> cluster = results.get(i);
       checkCluster(cluster, expectedDocCount, expectedLabelCount,
               expectedSubclusterCount);
     }
   }
 
-  private void checkClusters(List results, boolean hasSubclusters) {
+  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
     for (int i = 0; i < results.size(); i++) {
-      checkCluster((NamedList) results.get(i), hasSubclusters);
+      checkCluster(results.get(i), hasSubclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, boolean hasSubclusters) {
-    List docs = (List) cluster.get("docs");
+  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+    List<Object> docs = getDocs(cluster);
     assertNotNull("docs is null and it shouldn't be", docs);
     for (int j = 0; j < docs.size(); j++) {
       String id = (String) docs.get(j);
       assertNotNull("id is null and it shouldn't be", id);
     }
 
-    List labels = (List) cluster.get("labels");
+    List<String> labels = getLabels(cluster);
     assertNotNull("labels is null but it shouldn't be", labels);
 
     if (hasSubclusters) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertNotNull("subclusters is null but it shouldn't be", subclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, int expectedDocCount,
+  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
                             int expectedLabelCount, int expectedSubclusterCount) {
     checkCluster(cluster, expectedSubclusterCount > 0);
     assertEquals("number of docs in cluster", expectedDocCount,
-            ((List) cluster.get("docs")).size());
+            getDocs(cluster).size());
     assertEquals("number of labels in cluster", expectedLabelCount,
-            ((List) cluster.get("labels")).size());
+            getLabels(cluster).size());
 
     if (expectedSubclusterCount > 0) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
       assertEquals("number of subclusters in cluster",
               expectedSubclusterCount, subclusters.size());
     }
   }
+
+  @SuppressWarnings("unchecked")
+  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+    return (List<NamedList<Object>>) cluster.get("clusters");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<String> getLabels(NamedList<Object> cluster) {
+    return (List<String>) cluster.get("labels");
+  }
+  
+  private Double getScore(NamedList<Object> cluster) {
+    return (Double) cluster.get("score");
+  }
+
+  private Boolean isOtherTopics(NamedList<Object> cluster) {
+    return (Boolean) cluster.get("other-topics");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<Object> getDocs(NamedList<Object> cluster) {
+    return (List<Object>) cluster.get("docs");
+  }
 }

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java Mon May 16 14:21:41 2011
@@ -0,0 +1,82 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.List;
+
+import org.carrot2.core.Cluster;
+import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.core.ProcessingComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
+import org.carrot2.text.linguistic.ILexicalData;
+import org.carrot2.text.linguistic.ILexicalDataFactory;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.carrot2.util.attribute.Output;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A mock implementation of a Carrot2 clustering algorithm for testing whether
+ * the customized lexical resource lookup works correctly. This algorithm
+ * ignores the input documents; instead, for each comma-separated word in
+ * {@link #wordsToCheck}, it outputs a cluster labeled with that word, but only
+ * if the word is neither a stop word nor a stop label.
+ */
+@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
+public class LexicalResourcesCheckClusteringAlgorithm extends
+		ProcessingComponentBase implements IClusteringAlgorithm {
+
+	@Output
+	@Processing
+	@Attribute(key = AttributeNames.CLUSTERS)
+	private List<Cluster> clusters;
+
+	@Input
+	@Processing
+	@Attribute
+	private String wordsToCheck;
+
+	private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
+
+	@Override
+	public void process() throws ProcessingException {
+		clusters = Lists.newArrayList();
+		if (wordsToCheck == null) {
+			return;
+		}
+
+		// Test with Maltese so that the English clustering performed in other tests
+		// is not affected by the test stopwords and stoplabels.
+		ILexicalData lexicalData = preprocessing.lexicalDataFactory
+				.getLexicalData(LanguageCode.MALTESE);
+
+		for (String word : wordsToCheck.split(",")) {
+			if (!lexicalData.isCommonWord(new MutableCharArray(word))
+					&& !lexicalData.isStopLabel(word)) {
+				clusters.add(new Cluster(word));
+			}
+		}
+	}
+}

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Mon May 16 14:21:41 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
   @IntRange(min = 1, max = 5)
   private int labels = 1;
 
+  @Input
+  @Processing
+  @Attribute
+  private int otherTopicsModulo = 0;
+
   @Override
   public void process() throws ProcessingException {
     clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
     int documentIndex = 1;
     for (Document document : documents) {
       StringBuilder label = new StringBuilder("Cluster " + documentIndex);
-      Cluster cluster = createCluster(label.toString(), document);
+      Cluster cluster = createCluster(label.toString(), documentIndex, document);
       clusters.add(cluster);
       for (int i = 1; i <= depth; i++) {
         label.append(".");
         label.append(i);
-        Cluster newCluster = createCluster(label.toString(), document);
-        cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
         cluster = newCluster;
       }
       documentIndex++;
     }
   }
 
-  private Cluster createCluster(String labelBase, Document... documents) {
+  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
     Cluster cluster = new Cluster();
+    cluster.setScore(documentIndex * 0.25);
+    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+    {
+      cluster.setOtherTopics(true);
+    }
     for (int i = 0; i < labels; i++) {
       cluster.addPhrases(labelBase + "#" + (i + 1));
     }

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstoplabel

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstopword

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstoplabelcustomdir

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstopwordcustomdir

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Mon May 16 14:21:41 2011
@@ -404,6 +404,15 @@
       <str name="name">mock</str>
       <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
     </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check-custom-resource-dir</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+    </lst>
   </searchComponent>
 
   <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Mon May 16 14:21:41 2011
@@ -55,4 +55,5 @@ to
 was
 will
 with
+solrownstopword
 

Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml Mon May 16 14:21:41 2011
@@ -1193,17 +1193,20 @@
     <lst name="engine">
       <!-- The name, only one can be named "default" -->
       <str name="name">default</str>
+
       <!-- Class name of Carrot2 clustering algorithm. 
            
            Currently available algorithms are:
            
            * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
            * org.carrot2.clustering.stc.STCClusteringAlgorithm
+           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
            
            See http://project.carrot2.org/algorithms.html for the
            algorithm's characteristics.
         -->
       <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
       <!-- Overriding values for Carrot2 default algorithm attributes.
 
            For a description of all available attributes, see:
@@ -1215,6 +1218,19 @@
         -->
       <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
       
+      <!-- Location of Carrot2 lexical resources.
+
+           A directory from which to load Carrot2-specific stop words
+           and stop labels. The path may be absolute or relative to the
+           Solr config directory. If a specific resource (e.g. stopwords.en)
+           is present in the specified directory, it completely overrides
+           the corresponding default one that ships with Carrot2.
+
+           For an overview of Carrot2 lexical resources, see:
+           http://download.carrot2.org/head/manual/#chapter.lexical-resources
+        -->
+      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
       <!-- The language to assume for the documents.
            
            For a list of allowed values, see: