You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by st...@apache.org on 2011/05/16 16:21:42 UTC
svn commit: r1103746 - in /lucene/dev/branches/branch_3x: dev-tools/maven/
solr/ solr/contrib/clustering/ solr/contrib/clustering/lib/
solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/
solr/contrib/clustering/src/test/ja...
Author: stanislaw
Date: Mon May 16 14:21:41 2011
New Revision: 1103746
URL: http://svn.apache.org/viewvc?rev=1103746&view=rev
Log:
SOLR-2448: Upgrade of Carrot2 to version 3.5.0 and a number of related clustering improvements (SOLR-2449, SOLR-2450, SOLR-2505)
Added:
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.5.0.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.4-jdk15.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt
Removed:
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.2.jar
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
Modified:
lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/build.xml
lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
Modified: lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template (original)
+++ lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template Mon May 16 14:21:41 2011
@@ -106,14 +106,6 @@
</licenses>
<repositories>
<repository>
- <id>carrot2.org</id>
- <name>Carrot2 Maven2 repository</name>
- <url>http://download.carrot2.org/maven2/</url>
- <snapshots>
- <updatePolicy>never</updatePolicy>
- </snapshots>
- </repository>
- <repository>
<id>apache.snapshots</id>
<name>Apache Snapshot Repository</name>
<url>http://repository.apache.org/snapshots</url>
@@ -737,7 +729,7 @@
<goal>install-file</goal>
</goals>
<configuration>
- <file>solr/contrib/clustering/lib/carrot2-core-3.4.2.jar</file>
+ <file>solr/contrib/clustering/lib/carrot2-core-3.5.0.jar</file>
<pomFile>lucene/build/lucene-solr-grandparent/solr-carrot2-core-pom.xml.template</pomFile>
</configuration>
</execution>
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Mon May 16 14:21:41 2011
@@ -29,7 +29,7 @@ Versions of Major Components
---------------------
Apache Lucene 3x
Apache Tika 0.8
-Carrot2 3.4.2
+Carrot2 3.5.0
Upgrading from Solr 3.1
Modified: lucene/dev/branches/branch_3x/solr/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/build.xml?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/build.xml (original)
+++ lucene/dev/branches/branch_3x/solr/build.xml Mon May 16 14:21:41 2011
@@ -1034,7 +1034,7 @@
jar.file="contrib/uima/lib/uima-an-wst-2.3.1-SNAPSHOT-r1076132.jar" />
<m2-deploy-with-pom-template pom.xml="contrib/clustering/lib/solr-carrot2-core-pom.xml.template"
- jar.file="contrib/clustering/lib/carrot2-core-3.4.2.jar" />
+ jar.file="contrib/clustering/lib/carrot2-core-3.5.0.jar" />
<!-- ========== SOLR ARTIFACTS ========== -->
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt Mon May 16 14:21:41 2011
@@ -10,8 +10,11 @@ $Id$
================== Release 3.2.0-dev ==================
-(No Changes)
-
+* SOLR-2448: Search results clustering updates: bisecting k-means
+ clustering algorithm added, loading of Carrot2 stop words from
+ <solr.home>/conf/clustering/carrot2 (SOLR-2449), using Solr's stopwords.txt
+ for clustering (SOLR-2450), output of cluster scores (SOLR-2505).
+
================== Release 3.1.0-dev ==================
* SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.5.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.5.0.jar?rev=1103746&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.4-jdk15.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.4-jdk15.jar?rev=1103746&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/solr-carrot2-core-pom.xml.template Mon May 16 14:21:41 2011
@@ -26,10 +26,10 @@
<packaging>jar</packaging>
<name>Solr Specific Carrot2</name>
<description>
- Carrot2 search results clustering framework core, document
- sources and clustering algorithms.
+ Carrot2 search results clustering framework core
+ and clustering algorithms.
- Compiled with Java 1.5 from carrot2-core v3.4.2 sources.
+ Compiled with Java 1.5 from carrot2-core v3.5.0 sources.
</description>
<licenses>
<license>
@@ -59,196 +59,78 @@
<url>http://issues.carrot2.org/</url>
</issueManagement>
- <repositories>
- <repository>
- <id>carrotsearch.labs.releases</id>
- <name>Carrot Search Labs Releases Repository</name>
- <url>http://repository.carrotsearch.com/labs/releases</url>
- </repository>
- </repositories>
+ <properties>
+ <jackson.version>1.7.4</jackson.version>
+ <slf4j.version>1.6.1</slf4j.version>
+ </properties>
<dependencies>
<dependency>
- <groupId>org.carrot2</groupId>
- <artifactId>rome</artifactId>
- <version>1.0.RC1</version>
- </dependency>
-
- <dependency>
- <groupId>org.carrot2</groupId>
- <artifactId>rome-fetcher</artifactId>
- <version>0.7</version>
- </dependency>
-
- <dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-core-asl</artifactId>
- <version>1.5.2</version>
+ <version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
- <version>1.5.2</version>
+ <version>${jackson.version}</version>
</dependency>
-
+
<dependency>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache-core</artifactId>
- <version>1.7.1</version>
- </dependency>
-
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>1.3</version>
- </dependency>
-
- <dependency>
- <groupId>commons-collections</groupId>
- <artifactId>commons-collections</artifactId>
- <version>3.2.1</version>
- </dependency>
-
- <dependency>
- <groupId>commons-discovery</groupId>
- <artifactId>commons-discovery</artifactId>
- <version>0.2</version>
- </dependency>
-
- <dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>3.1</version>
- </dependency>
-
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>1.4</version>
+ <version>1.7.2</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
- <version>2.4</version>
- </dependency>
-
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <version>1.1.1</version>
+ <version>2.6</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <version>r05</version>
- </dependency>
-
- <dependency>
- <groupId>org.jdom</groupId>
- <artifactId>jdom</artifactId>
- <version>1.1</version>
+ <version>r08</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
- <version>1.5.8</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- <version>3.0.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers</artifactId>
- <version>3.0.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-highlighter</artifactId>
- <version>3.0.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-snowball</artifactId>
- <version>3.0.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-smartcn</artifactId>
- <version>3.0.1</version>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-memory</artifactId>
- <version>3.0.1</version>
- </dependency>
-
- <dependency>
- <groupId>oro</groupId>
- <artifactId>oro</artifactId>
- <version>2.0.8</version>
- </dependency>
-
- <dependency>
- <groupId>org.simpleframework</groupId>
- <artifactId>simple-xml</artifactId>
- <version>2.3.5</version>
- </dependency>
-
- <dependency>
- <groupId>org.codehaus.woodstox</groupId>
- <artifactId>wstx-asl</artifactId>
- <version>4.0.0</version>
- </dependency>
-
- <dependency>
- <groupId>xalan</groupId>
- <artifactId>xalan</artifactId>
- <version>2.7.0</version>
- </dependency>
-
- <dependency>
- <groupId>xerces</groupId>
- <artifactId>xercesImpl</artifactId>
- <version>2.8.1</version>
+ <version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-math</artifactId>
<version>0.3</version>
+ <exclusions>
+ <exclusion>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>org.apache.mahout</groupId>
<artifactId>mahout-collections</artifactId>
<version>0.3</version>
- </dependency>
-
- <dependency>
- <groupId>com.carrotsearch</groupId>
- <artifactId>nni</artifactId>
- <version>1.0.0</version>
- <optional>true</optional>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-collection-codegen-plugin</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>com.carrotsearch</groupId>
<artifactId>hppc</artifactId>
- <version>0.3.1</version>
- </dependency>
+ <version>0.3.4</version>
+ <classifier>jdk15</classifier>
+ </dependency>
</dependencies>
</project>
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon May 16 14:21:41 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
*/
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
@@ -51,9 +54,17 @@ import org.carrot2.core.ControllerFactor
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
@@ -63,19 +74,33 @@ import com.google.common.collect.Sets;
*
* @link http://project.carrot2.org
*/
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
+
+ /**
+ * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+ */
+ private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+ /**
+ * Name of Carrot2 document's field containing Solr document's identifier.
+ */
+ private static final String SOLR_DOCUMENT_ID = "solrId";
+
+ /**
+ * Name of Solr document's field containing the document's identifier. To avoid
+ * repeating the content of documents in clusters on output, each cluster contains
+ * identifiers of documents it contains.
+ */
+ private String idFieldName;
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
-
- private String idFieldName;
-
+
@Override
@Deprecated
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -100,6 +125,10 @@ public class CarrotClusteringEngine exte
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
+ // Pass the fields on which clustering runs to the
+ // SolrStopwordsCarrot2LexicalDataFactory
+ attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
@@ -113,21 +142,66 @@ public class CarrotClusteringEngine exte
}
@Override
+ @SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
- SolrParams initParams = SolrParams.toSolrParams(config);
+ final SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
- // Customize the language model factory. The implementation we provide here
- // is included in the code base of Solr, so that it's possible to refactor
- // the Lucene APIs the factory relies on if needed.
- initAttributes.put("PreprocessingPipeline.languageModelFactory",
- LuceneLanguageModelFactory.class);
- this.controller.init(initAttributes);
+ // Customize the stemmer and tokenizer factories. The implementations we provide here
+ // are included in the code base of Solr, so that it's possible to refactor
+ // the Lucene APIs the factories rely on if needed.
+ // Additionally, we set a custom lexical resource factory for Carrot2 that
+ // will use both Carrot2 default stop words as well as stop words from
+ // the StopFilter defined on the field.
+ BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+ .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+ .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+ .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+ // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+ initAttributes.put("solrIndexSchema", core.getSchema());
+ // Customize Carrot2's resource lookup to first look for resources
+ // using Solr's resource loader. If that fails, try loading from the classpath.
+ DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+ .resourceLookup(new ResourceLookup(new IResourceLocator() {
+ public IResource[] getAll(final String resource) {
+ final SolrResourceLoader resourceLoader = core.getResourceLoader();
+ final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+ + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+ try {
+ log.debug("Looking for " + resource + " in "
+ + carrot2ResourcesDir);
+ final InputStream resourceStream = resourceLoader
+ .openResource(carrot2ResourcesDir + "/" + resource);
+
+ log.info(resource + " loaded from " + carrot2ResourcesDir);
+ final IResource foundResource = new IResource() {
+ public InputStream open() throws IOException {
+ return resourceStream;
+ }
+ };
+ return new IResource[] { foundResource };
+ } catch (RuntimeException e) {
+ // No way to distinguish if the resource was found but failed
+ // to load or wasn't found at all, so we simply fall back
+ // to Carrot2 defaults here by returning an empty locations array.
+ log.debug(resource + " not found in " + carrot2ResourcesDir
+ + ". Using the default " + resource + " from Carrot JAR.");
+ return new IResource[] {};
+ }
+
+ }
+ },
+
+ // Using the class loader directly because this time we want to omit the prefix
+ new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+ this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Make sure the requested Carrot2 clustering algorithm class is available
@@ -147,20 +221,33 @@ public class CarrotClusteringEngine exte
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
- // Names of fields to deliver content for clustering
- String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+ HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+ fields.add(idFieldName);
+ fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+ return fields;
+ }
+
+ /**
+ * Returns the names of fields that will be delivering the actual
+ * content for clustering. Currently, there are two such fields: document
+ * title and document content.
+ */
+ private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+ SolrParams solrParams = sreq.getParams();
+
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
- return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
- }
+ return Sets.newHashSet(titleField, snippetField);
+ }
/**
* Prepares Carrot2 documents for clustering.
*/
+ @SuppressWarnings("deprecation")
private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds,
Query query, final SolrQueryRequest sreq) throws IOException {
SolrHighlighter highlighter = null;
@@ -179,7 +266,7 @@ public class CarrotClusteringEngine exte
if (produceSummary == true) {
highlighter = core.getHighlighter();
if (highlighter != null){
- Map args = new HashMap();
+ Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
@@ -213,11 +300,12 @@ public class CarrotClusteringEngine exte
if (produceSummary && docIds != null) {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
- NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+ NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
- NamedList tmp = (NamedList) highlights.getVal(0);
- String [] highlt = (String[]) tmp.get(snippetField);
+ @SuppressWarnings("unchecked")
+ NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+ String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
}
@@ -225,7 +313,7 @@ public class CarrotClusteringEngine exte
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
- carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+ carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
result.add(carrotDocument);
}
@@ -260,9 +348,9 @@ public class CarrotClusteringEngine exte
return result.toString().trim();
}
- private List clustersToNamedList(List<Cluster> carrotClusters,
+ private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
- List result = new ArrayList();
+ List<NamedList<Object>> result = Lists.newArrayList();
clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -270,25 +358,40 @@ public class CarrotClusteringEngine exte
}
private void clustersToNamedList(List<Cluster> outputClusters,
- List parent, boolean outputSubClusters, int maxLabels) {
+ List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) {
- NamedList cluster = new SimpleOrderedMap();
+ NamedList<Object> cluster = new SimpleOrderedMap<Object>();
parent.add(cluster);
+ // Add labels
List<String> labels = outCluster.getPhrases();
- if (labels.size() > maxLabels)
+ if (labels.size() > maxLabels) {
labels = labels.subList(0, maxLabels);
+ }
cluster.add("labels", labels);
+ // Add cluster score
+ final Double score = outCluster.getScore();
+ if (score != null) {
+ cluster.add("score", score);
+ }
+
+ // Add other topics marker
+ if (outCluster.isOtherTopics()) {
+ cluster.add("other-topics", outCluster.isOtherTopics());
+ }
+
+ // Add documents
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
- List docList = new ArrayList();
+ List<Object> docList = Lists.newArrayList();
cluster.add("docs", docList);
for (Document doc : docs) {
- docList.add(doc.getField("solrId"));
+ docList.add(doc.getField(SOLR_DOCUMENT_ID));
}
- if (outputSubClusters) {
- List subclusters = new ArrayList();
+ // Add subclusters
+ if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+ List<NamedList<Object>> subclusters = Lists.newArrayList();
cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Mon May 16 14:21:41 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
+ String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2StemmerFactory.java Mon May 16 14:21:41 2011
@@ -0,0 +1,239 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.CharBuffer;
+import java.util.HashMap;
+
+import org.apache.lucene.analysis.ar.ArabicNormalizer;
+import org.apache.lucene.analysis.ar.ArabicStemmer;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.linguistic.IStemmer;
+import org.carrot2.text.linguistic.IStemmerFactory;
+import org.carrot2.util.ReflectionUtils;
+import org.slf4j.Logger;
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.DanishStemmer;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.FinnishStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
+import org.tartarus.snowball.ext.GermanStemmer;
+import org.tartarus.snowball.ext.HungarianStemmer;
+import org.tartarus.snowball.ext.ItalianStemmer;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+import org.tartarus.snowball.ext.RomanianStemmer;
+import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SpanishStemmer;
+import org.tartarus.snowball.ext.SwedishStemmer;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
+ * APIs. Should the relevant Lucene APIs need to change, the changes can be made
+ * in this class.
+ */
+public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(LuceneCarrot2StemmerFactory.class);
+
+ public IStemmer getStemmer(LanguageCode language) {
+   // Every branch of the switch assigns the result exactly once.
+   final IStemmer stemmer;
+   switch (language) {
+     case CHINESE_SIMPLIFIED:
+       stemmer = IdentityStemmer.INSTANCE;
+       break;
+
+     case ARABIC:
+       stemmer = ArabicStemmerFactory.createStemmer();
+       break;
+
+     default:
+       // Any other language: try to use snowball's stemming.
+       stemmer = SnowballStemmerFactory.createStemmer(language);
+       break;
+   }
+   return stemmer;
+ }
+
+ /**
+  * Builds {@link IStemmer} instances backed by stemmers from the
+  * <code>snowball</code> project.
+  */
+ private final static class SnowballStemmerFactory {
+   /**
+    * Hard-wired table from language codes to Snowball stemmer classes. The
+    * table is static rather than dynamic because we want to keep the
+    * possibility to obfuscate these classes.
+    */
+   private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+   static {
+     final HashMap<LanguageCode, Class<? extends SnowballProgram>> classes =
+         new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+     classes.put(LanguageCode.DANISH, DanishStemmer.class);
+     classes.put(LanguageCode.DUTCH, DutchStemmer.class);
+     classes.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+     classes.put(LanguageCode.FINNISH, FinnishStemmer.class);
+     classes.put(LanguageCode.FRENCH, FrenchStemmer.class);
+     classes.put(LanguageCode.GERMAN, GermanStemmer.class);
+     classes.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+     classes.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+     classes.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+     classes.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
+     classes.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+     classes.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+     classes.put(LanguageCode.SPANISH, SpanishStemmer.class);
+     classes.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+     classes.put(LanguageCode.TURKISH, TurkishStemmer.class);
+     snowballStemmerClasses = classes;
+   }
+
+   /**
+    * Wraps a {@link SnowballProgram} so that it can be used through the
+    * {@link IStemmer} interface.
+    */
+   private static class SnowballStemmerAdapter implements IStemmer {
+     private final SnowballProgram snowballStemmer;
+
+     public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+       this.snowballStemmer = snowballStemmer;
+     }
+
+     public CharSequence stem(CharSequence word) {
+       snowballStemmer.setCurrent(word.toString());
+       // Hand back the stemmer's current value only when stem() reports success.
+       return snowballStemmer.stem() ? snowballStemmer.getCurrent() : null;
+     }
+   }
+
+   /**
+    * Returns an {@link IStemmer} adapter around the {@link SnowballProgram}
+    * registered for the given language code. Languages without a registered
+    * class, and classes that cannot be instantiated, yield the identity
+    * stemmer; both cases are logged as warnings.
+    */
+   public static IStemmer createStemmer(LanguageCode language) {
+     final Class<? extends SnowballProgram> registered = snowballStemmerClasses
+         .get(language);
+
+     if (registered != null) {
+       try {
+         return new SnowballStemmerAdapter(registered.newInstance());
+       } catch (Exception e) {
+         logger.warn("Could not instantiate snowball stemmer"
+             + " for language: " + language.name()
+             + ". Quality of clustering may be degraded.", e);
+         return IdentityStemmer.INSTANCE;
+       }
+     }
+
+     logger.warn("No Snowball stemmer class for: " + language.name()
+         + ". Quality of clustering may be degraded.");
+     return IdentityStemmer.INSTANCE;
+   }
+ }
+
+ /**
+  * Factory of {@link IStemmer} implementations for the
+  * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
+  * to be present in classpath, otherwise an empty (identity) stemmer is
+  * returned.
+  */
+ private static class ArabicStemmerFactory {
+   static {
+     try {
+       ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
+       ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
+     } catch (Throwable e) {
+       // Catch Throwable rather than only ClassNotFoundException: when the
+       // Arabic analyzer JAR is missing, resolving the class literals above
+       // fails with NoClassDefFoundError. Left uncaught, that error would
+       // abort this static initializer and surface from createStemmer()'s
+       // caller as ExceptionInInitializerError instead of triggering the
+       // identity-stemmer fallback.
+       logger
+           .warn(
+               "Could not instantiate Lucene stemmer for Arabic, clustering quality "
+                   + "of Arabic content may be degraded. For best quality clusters, "
+                   + "make sure Lucene's Arabic analyzer JAR is in the classpath",
+               e);
+     }
+   }
+
+   /**
+    * Adapter to lucene-contrib Arabic analyzers. Each call normalizes and then
+    * stems the word in a reusable scratch buffer.
+    */
+   private static class LuceneStemmerAdapter implements IStemmer {
+     private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+     private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+     // Scratch buffer, grown on demand and reused between calls.
+     private char[] buffer = new char[0];
+
+     private LuceneStemmerAdapter() throws Exception {
+       delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+       normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+     }
+
+     /**
+      * Returns the normalized and stemmed form of <code>word</code>, or
+      * <code>null</code> when processing left the word unchanged. The returned
+      * sequence wraps the internal buffer, which the next call overwrites.
+      */
+     public CharSequence stem(CharSequence word) {
+       if (word.length() > buffer.length) {
+         buffer = new char[word.length()];
+       }
+
+       for (int i = 0; i < word.length(); i++) {
+         buffer[i] = word.charAt(i);
+       }
+
+       int newLen = normalizer.normalize(buffer, word.length());
+       newLen = delegate.stem(buffer, newLen);
+
+       // equals() is only reached when the lengths already match.
+       if (newLen != word.length() || !equals(buffer, newLen, word)) {
+         return CharBuffer.wrap(buffer, 0, newLen);
+       }
+
+       // Same-same.
+       return null;
+     }
+
+     /** Compares the first <code>len</code> buffer characters with the word. */
+     private boolean equals(char[] buffer, int len, CharSequence word) {
+       assert len == word.length();
+
+       for (int i = 0; i < len; i++) {
+         if (buffer[i] != word.charAt(i))
+           return false;
+       }
+
+       return true;
+     }
+   }
+
+   /**
+    * Creates the Lucene-backed Arabic stemmer, or the identity stemmer when the
+    * adapter cannot be constructed for any reason.
+    */
+   public static IStemmer createStemmer() {
+     try {
+       return new LuceneStemmerAdapter();
+     } catch (Throwable e) {
+       // Best-effort fallback; nothing is logged at this point.
+       return IdentityStemmer.INSTANCE;
+     }
+   }
+ }
+
+ /**
+  * A no-op {@link IStemmer}: {@link #stem(CharSequence)} always answers
+  * <code>null</code>, which means no stemming.
+  */
+ private static class IdentityStemmer implements IStemmer {
+   /** The single shared instance handed out by the factories. */
+   private static final IdentityStemmer INSTANCE = new IdentityStemmer();
+
+   public CharSequence stem(CharSequence word) {
+     // The argument is ignored on purpose.
+     return null;
+   }
+ }
+}
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneCarrot2TokenizerFactory.java Mon May 16 14:21:41 2011
@@ -0,0 +1,155 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
+import org.carrot2.text.analysis.ITokenizer;
+import org.carrot2.text.linguistic.ITokenizerFactory;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.ExceptionUtils;
+import org.carrot2.util.ReflectionUtils;
+import org.slf4j.Logger;
+
+/**
+ * An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
+ * Smart Chinese tokenizer. If Smart Chinese tokenizer is not available in
+ * classpath at runtime, the default Carrot2's tokenizer is used. Should the
+ * Lucene APIs need to change, the changes can be made in this class.
+ */
+public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(LuceneCarrot2TokenizerFactory.class);
+
+ public ITokenizer getTokenizer(LanguageCode language) {
+   final ITokenizer tokenizer;
+   switch (language) {
+     case CHINESE_SIMPLIFIED:
+       tokenizer = ChineseTokenizerFactory.createTokenizer();
+       break;
+
+     /*
+      * Arabic deliberately shares the default branch, i.e. we use our own
+      * analyzer for it. Lucene's version has special support for
+      * Nonspacing-Mark characters (see
+      * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+      * have them included as letters in the parser.
+      */
+     case ARABIC:
+     default:
+       tokenizer = new ExtendedWhitespaceTokenizer();
+       break;
+   }
+   return tokenizer;
+ }
+
+ /**
+ * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+ * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+ * factory will fall back to the default white space tokenizer.
+ * <p>
+ * The static initializer only probes for the two Smart Chinese classes by
+ * name and logs a warning (without the exception) when that fails; the actual
+ * fallback to {@link ExtendedWhitespaceTokenizer} happens in
+ * <code>createTokenizer()</code> whenever constructing the adapter throws.
+ * <p>
+ * The nested adapter assigns its word token filter and term attribute only in
+ * <code>reset(Reader)</code>, while <code>nextToken()</code> and
+ * <code>setTermBuffer()</code> read them, so callers are presumably expected
+ * to call <code>reset(Reader)</code> first. <code>nextToken()</code> reports a
+ * token consisting of a single ',' as punctuation, a token matching the
+ * <code>numeric</code> pattern as numeric and anything else as a term, and
+ * returns <code>TT_EOF</code> once the filter has no more tokens. Reflection
+ * failures in <code>reset(Reader)</code> are rethrown as runtime exceptions.
+ */
+ private static final class ChineseTokenizerFactory {
+ static {
+ try {
+ ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+ ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+ } catch (Throwable e) {
+ logger
+ .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+ + "of Chinese content may be degraded. For best quality clusters, "
+ + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+ }
+ }
+
+ static ITokenizer createTokenizer() {
+ try {
+ return new ChineseTokenizer();
+ } catch (Throwable e) {
+ return new ExtendedWhitespaceTokenizer();
+ }
+ }
+
+ private final static class ChineseTokenizer implements ITokenizer {
+ private final static Pattern numeric = Pattern
+ .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+ private Tokenizer sentenceTokenizer;
+ private TokenStream wordTokenFilter;
+ private CharTermAttribute term = null;
+
+ private final MutableCharArray tempCharSequence;
+ private final Class<?> tokenFilterClass;
+
+ private ChineseTokenizer() throws Exception {
+ this.tempCharSequence = new MutableCharArray(new char[0]);
+
+ // As Smart Chinese is not available during compile time,
+ // we need to resort to reflection.
+ final Class<?> tokenizerClass = ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+ this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+ Reader.class).newInstance((Reader) null);
+ this.tokenFilterClass = ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+ }
+
+ public short nextToken() throws IOException {
+ final boolean hasNextToken = wordTokenFilter.incrementToken();
+ if (hasNextToken) {
+ short flags = 0;
+ final char[] image = term.buffer();
+ final int length = term.length();
+ tempCharSequence.reset(image, 0, length);
+ if (length == 1 && image[0] == ',') {
+ // ChineseTokenizer seems to convert all punctuation to ','
+ // characters
+ flags = ITokenizer.TT_PUNCTUATION;
+ } else if (numeric.matcher(tempCharSequence).matches()) {
+ flags = ITokenizer.TT_NUMERIC;
+ } else {
+ flags = ITokenizer.TT_TERM;
+ }
+ return flags;
+ }
+
+ return ITokenizer.TT_EOF;
+ }
+
+ public void setTermBuffer(MutableCharArray array) {
+ array.reset(term.buffer(), 0, term.length());
+ }
+
+ public void reset(Reader input) throws IOException {
+ try {
+ sentenceTokenizer.reset(input);
+ wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+ TokenStream.class).newInstance(sentenceTokenizer);
+ term = wordTokenFilter.addAttribute(CharTermAttribute.class);
+ } catch (Exception e) {
+ throw ExceptionUtils.wrapAsRuntimeException(e);
+ }
+ }
+ }
+ }
+}
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java Mon May 16 14:21:41 2011
@@ -0,0 +1,138 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.solr.analysis.CommonGramsFilterFactory;
+import org.apache.solr.analysis.StopFilterFactory;
+import org.apache.solr.analysis.TokenFilterFactory;
+import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.schema.IndexSchema;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.core.attribute.Init;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
+import org.carrot2.text.linguistic.ILexicalData;
+import org.carrot2.text.linguistic.ILexicalDataFactory;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.slf4j.Logger;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+/**
+ * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
+ * words from a field's StopFilter to the default stop words used in Carrot2,
+ * for all languages Carrot2 supports. Completely replacing Carrot2 stop words
+ * with Solr's wouldn't make much sense because clustering needs more aggressive
+ * stop words removal. In other words, if something is a stop word during
+ * indexing, then it should also be a stop word during clustering, but not the
+ * other way round.
+ */
+@Bindable
+public class SolrStopwordsCarrot2LexicalDataFactory implements
+ ILexicalDataFactory {
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+ @Init
+ @Input
+ @Attribute(key = "solrIndexSchema")
+ private IndexSchema schema;
+
+ @Processing
+ @Input
+ @Attribute(key = "solrFieldNames")
+ private Set<String> fieldNames;
+
+ /**
+ * A lazily-built cache of stop words per field.
+ */
+ private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
+
+ /**
+ * Carrot2's default lexical resources to use in addition to Solr's stop
+ * words.
+ */
+ private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
+
+ /**
+  * Obtains stop words for a field from the associated
+  * {@link StopFilterFactory} and {@link CommonGramsFilterFactory}, if any.
+  * The field's analyzer chain is inspected on the first request only; the
+  * result is cached in <code>solrStopWords</code>.
+  *
+  * @param fieldName name of the schema field to look up
+  * @return the word sets collected for the field; for a field without any
+  *         such filter this is a single empty set
+  */
+ private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
+   // No need to synchronize here, Carrot2 ensures that instances
+   // of this class are not used by multiple threads at a time.
+   if (!solrStopWords.containsKey(fieldName)) {
+     final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
+         .getAnalyzer();
+     if (fieldAnalyzer instanceof TokenizerChain) {
+       final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
+           .getTokenFilterFactories();
+       for (TokenFilterFactory factory : filterFactories) {
+         if (factory instanceof StopFilterFactory) {
+           // StopFilterFactory holds the stop words in a CharArraySet, but
+           // the getStopWords() method returns a Set<?>, so we need to cast.
+           solrStopWords.put(fieldName,
+               (CharArraySet) ((StopFilterFactory) factory).getStopWords());
+         }
+
+         if (factory instanceof CommonGramsFilterFactory) {
+           solrStopWords.put(fieldName,
+               (CharArraySet) ((CommonGramsFilterFactory) factory)
+                   .getCommonWords());
+         }
+       }
+     }
+
+     // A Multimap does not keep keys that have no values, so a field without
+     // stop words would never count as cached and its analyzer chain would
+     // be re-scanned on every isCommonWord() call. Store an empty set as a
+     // marker so the lookup above succeeds next time.
+     if (!solrStopWords.containsKey(fieldName)) {
+       solrStopWords.put(fieldName, CharArraySet.EMPTY_SET);
+     }
+   }
+   return solrStopWords.get(fieldName);
+ }
+
+ /**
+  * Returns lexical data for the given language that treats a word as common
+  * if any Solr stop word set of the clustered fields contains it, or if
+  * Carrot2's default lexical data does. Stop labels come from Carrot2 only.
+  */
+ public ILexicalData getLexicalData(LanguageCode languageCode) {
+   final ILexicalData carrot2Defaults = carrot2LexicalDataFactory
+       .getLexicalData(languageCode);
+
+   return new ILexicalData() {
+     public boolean isCommonWord(MutableCharArray word) {
+       // Consult the stop words of the fields involved in clustering first.
+       boolean solrStopWord = false;
+       search:
+       for (String fieldName : fieldNames) {
+         for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
+           if (stopWords.contains(word)) {
+             solrStopWord = true;
+             break search;
+           }
+         }
+       }
+       // Carrot2's default stop words are only asked when Solr had no match.
+       return solrStopWord || carrot2Defaults.isCommonWord(word);
+     }
+
+     public boolean isStopLabel(CharSequence word) {
+       // Solr has no counterpart of a stop label, so Carrot2's default
+       // answer is returned unchanged.
+       return carrot2Defaults.isStopLabel(word);
+     }
+   };
+ }
+}
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon May 16 14:21:41 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
@@ -37,22 +42,18 @@ import org.apache.solr.util.SolrPluginUt
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
/**
*
*/
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
- // Note: the expected number of clusters may change after upgrading Carrot2
- // due to e.g. internal improvements or tuning of Carrot2 clustering.
+ // Note: the expected number of clusters may change after upgrading Carrot2
+ // due to e.g. internal improvements or tuning of Carrot2 clustering.
final int expectedNumClusters = 10;
- checkEngine(getClusteringEngine("default"), expectedNumClusters);
+ checkEngine(getClusteringEngine("default"), expectedNumClusters);
}
@Test
@@ -61,8 +62,8 @@ public class CarrotClusteringEngineTest
solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
- // Note: the expected number of clusters may change after upgrading Carrot2
- // due to e.g. internal improvements or tuning of Carrot2 clustering.
+ // Note: the expected number of clusters may change after upgrading Carrot2
+ // due to e.g. internal improvements or tuning of Carrot2 clustering.
final int expectedNumClusters = 15;
checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
}
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest
@Test
public void testWithoutSubclusters() throws Exception {
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
1, 1, 0);
}
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest
public void testWithSubclusters() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
}
@Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
+ public void testClusterScores() throws Exception {
+   final ModifiableSolrParams mockParams = new ModifiableSolrParams();
+   mockParams.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+   final List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+       AbstractClusteringTestCase.numberOfDocs, mockParams);
+   // The n-th cluster (counting from 1) must carry a score of exactly 0.25 * n.
+   for (int position = 1; position <= clusters.size(); position++) {
+     final Double score = getScore(clusters.get(position - 1));
+     assertNotNull(score);
+     assertEquals(0.25 * position, score, 0);
+   }
+ }
+
+ @Test
+ public void testOtherTopics() throws Exception {
+   final ModifiableSolrParams mockParams = new ModifiableSolrParams();
+   mockParams.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+   mockParams.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+   final List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+       AbstractClusteringTestCase.numberOfDocs, mockParams);
+   // Every second cluster must be flagged as "other topics"; the remaining
+   // clusters must not carry the flag at all.
+   int position = 0;
+   for (NamedList<Object> cluster : clusters) {
+     position++;
+     final Boolean expectedFlag = (position % 2 == 0) ? Boolean.TRUE : null;
+     assertEquals(expectedFlag, isOtherTopics(cluster));
+   }
+ }
+
+ @Test
public void testCarrotAttributePassing() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
+ @Test
+ public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+   // Engine and candidate words for the default lexical resources directory.
+   final String engineName = "lexical-resource-check";
+   final String candidateWords = "online,customsolrstopword,customsolrstoplabel";
+   checkLexicalResourcesFromSolrConfig(engineName, candidateWords);
+ }
+
+ @Test
+ public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+   // Engine and candidate words for the custom lexical resources directory.
+   final String engineName = "lexical-resource-check-custom-resource-dir";
+   final String candidateWords = "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir";
+   checkLexicalResourcesFromSolrConfig(engineName, candidateWords);
+ }
+
+ /**
+  * Clusters with the named engine, passing <code>wordsToCheck</code> to
+  * {@link LexicalResourcesCheckClusteringAlgorithm}, and asserts that exactly
+  * one cluster labelled "online" comes back.
+  */
+ private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+     throws IOException {
+   ModifiableSolrParams params = new ModifiableSolrParams();
+   params.set("merge-resources", false);
+   params.set(AttributeUtils.getKey(
+       LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+       wordsToCheck);
+
+   // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+   // stoplabels.en, so we're expecting only one cluster with label "online".
+   final List<NamedList<Object>> clusters = checkEngine(
+       getClusteringEngine(engineName), 1, params);
+   // Expected value goes first so that a failure message reads correctly.
+   assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+ }
+
+ @Test
+ public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+   ModifiableSolrParams params = new ModifiableSolrParams();
+   params.set("merge-resources", false);
+   params.set(AttributeUtils.getKey(
+       LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+       "online,solrownstopword");
+
+   // "solrownstopword" is in stopwords.txt, so we're expecting
+   // only one cluster with label "online".
+   final List<NamedList<Object>> clusters = checkEngine(
+       getClusteringEngine("lexical-resource-check"), 1, params);
+   // Expected value goes first so that a failure message reads correctly.
+   assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+ }
+
+ @Test
+ public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+   final ModifiableSolrParams params = new ModifiableSolrParams();
+   // Cluster on string fields. This would not make sense in the real
+   // world, but it does the job in the test.
+   params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+   params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+   params.set("merge-resources", false);
+   final String wordsToCheckKey = AttributeUtils.getKey(
+       LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck");
+   params.set(wordsToCheckKey, "online,solrownstopword");
+
+   // Both words are expected to come back as cluster labels, in this order.
+   final List<NamedList<Object>> clusters = checkEngine(
+       getClusteringEngine("lexical-resource-check"), 2, params);
+   final List<String> firstLabels = getLabels(clusters.get(0));
+   assertEquals(ImmutableList.of("online"), firstLabels);
+   final List<String> secondLabels = getLabels(clusters.get(1));
+   assertEquals(ImmutableList.of("solrownstopword"), secondLabels);
+ }
+
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest
return engine;
}
- private List checkEngine(CarrotClusteringEngine engine,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
}
- private List checkEngine(CarrotClusteringEngine engine,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
}
- private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
- List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+ @SuppressWarnings("unchecked")
+ List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest
}
}
- private void checkClusters(List results, int expectedDocCount,
+ private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
for (int i = 0; i < results.size(); i++) {
- NamedList cluster = (NamedList) results.get(i);
+ NamedList<Object> cluster = results.get(i);
checkCluster(cluster, expectedDocCount, expectedLabelCount,
expectedSubclusterCount);
}
}
- private void checkClusters(List results, boolean hasSubclusters) {
+ private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
for (int i = 0; i < results.size(); i++) {
- checkCluster((NamedList) results.get(i), hasSubclusters);
+ checkCluster(results.get(i), hasSubclusters);
}
}
- private void checkCluster(NamedList cluster, boolean hasSubclusters) {
- List docs = (List) cluster.get("docs");
+ private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+ List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
- List labels = (List) cluster.get("labels");
+ List<String> labels = getLabels(cluster);
assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) {
- List subclusters = (List) cluster.get("clusters");
+ List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
- private void checkCluster(NamedList cluster, int expectedDocCount,
+ private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount,
- ((List) cluster.get("docs")).size());
+ getDocs(cluster).size());
assertEquals("number of labels in cluster", expectedLabelCount,
- ((List) cluster.get("labels")).size());
+ getLabels(cluster).size());
if (expectedSubclusterCount > 0) {
- List subclusters = (List) cluster.get("clusters");
+ List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size());
}
}
+
+ @SuppressWarnings("unchecked")
+ private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+ return (List<NamedList<Object>>) cluster.get("clusters");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<String> getLabels(NamedList<Object> cluster) {
+ return (List<String>) cluster.get("labels");
+ }
+
+ private Double getScore(NamedList<Object> cluster) {
+ return (Double) cluster.get("score");
+ }
+
+ private Boolean isOtherTopics(NamedList<Object> cluster) {
+ return (Boolean) cluster.get("other-topics");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<Object> getDocs(NamedList<Object> cluster) {
+ return (List<Object>) cluster.get("docs");
+ }
}
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java Mon May 16 14:21:41 2011
@@ -0,0 +1,82 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.List;
+
+import org.carrot2.core.Cluster;
+import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.core.ProcessingComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
+import org.carrot2.text.linguistic.ILexicalData;
+import org.carrot2.text.linguistic.ILexicalDataFactory;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.carrot2.util.attribute.Output;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A mock implementation of a Carrot2 clustering algorithm for testing whether
+ * the customized lexical resource lookup works correctly. This algorithm ignores
+ * the input documents; instead, for each comma-separated word in
+ * {@link #wordsToCheck}, it outputs a cluster labeled with that word, but only
+ * if the word is neither a stop word nor a stop label.
+ */
+@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
+public class LexicalResourcesCheckClusteringAlgorithm extends
+ ProcessingComponentBase implements IClusteringAlgorithm {
+
+ @Output
+ @Processing
+ @Attribute(key = AttributeNames.CLUSTERS)
+ private List<Cluster> clusters;
+
+ @Input
+ @Processing
+ @Attribute
+ private String wordsToCheck;
+
+ private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
+
+ @Override
+ public void process() throws ProcessingException {
+ clusters = Lists.newArrayList();
+ if (wordsToCheck == null) {
+ return;
+ }
+
+ // Test with Maltese so that the English clustering performed in other tests
+ // is not affected by the test stopwords and stoplabels.
+ ILexicalData lexicalData = preprocessing.lexicalDataFactory
+ .getLexicalData(LanguageCode.MALTESE);
+
+ for (String word : wordsToCheck.split(",")) {
+ if (!lexicalData.isCommonWord(new MutableCharArray(word))
+ && !lexicalData.isStopLabel(word)) {
+ clusters.add(new Cluster(word));
+ }
+ }
+ }
+}
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Mon May 16 14:21:41 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
@IntRange(min = 1, max = 5)
private int labels = 1;
+ @Input
+ @Processing
+ @Attribute
+ private int otherTopicsModulo = 0;
+
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
- Cluster cluster = createCluster(label.toString(), document);
+ Cluster cluster = createCluster(label.toString(), documentIndex, document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
- Cluster newCluster = createCluster(label.toString(), document);
- cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+ Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+ cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
- private Cluster createCluster(String labelBase, Document... documents) {
+ private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
Cluster cluster = new Cluster();
+ cluster.setScore(documentIndex * 0.25);
+ if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+ {
+ cluster.setOtherTopics(true);
+ }
for (int i = 0; i < labels; i++) {
cluster.addPhrases(labelBase + "#" + (i + 1));
}
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stoplabels.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstoplabel
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/carrot2/stopwords.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstopword
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stoplabels.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstoplabelcustomdir
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt?rev=1103746&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/clustering/custom/stopwords.mt Mon May 16 14:21:41 2011
@@ -0,0 +1 @@
+customsolrstopwordcustomdir
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Mon May 16 14:21:41 2011
@@ -404,6 +404,15 @@
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ </lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check-custom-resource-dir</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+ </lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Mon May 16 14:21:41 2011
@@ -55,4 +55,5 @@ to
was
will
with
+solrownstopword
Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml?rev=1103746&r1=1103745&r2=1103746&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml Mon May 16 14:21:41 2011
@@ -1193,17 +1193,20 @@
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">default</str>
+
<!-- Class name of Carrot2 clustering algorithm.
Currently available algorithms are:
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm
* org.carrot2.clustering.stc.STCClusteringAlgorithm
+ * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
See http://project.carrot2.org/algorithms.html for the
algorithm's characteristics.
-->
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
<!-- Overriding values for Carrot2 default algorithm attributes.
For a description of all available attributes, see:
@@ -1215,6 +1218,19 @@
-->
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+ <!-- Location of Carrot2 lexical resources.
+
+ A directory from which to load Carrot2-specific stop words
+ and stop labels. The path may be absolute or relative to the
+ Solr config directory. If a specific resource (e.g. stopwords.en)
+ is present in the specified directory, it completely overrides
+ the corresponding default one that ships with Carrot2.
+
+ For an overview of Carrot2 lexical resources, see:
+ http://download.carrot2.org/head/manual/#chapter.lexical-resources
+ -->
+ <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
<!-- The language to assume for the documents.
For a list of allowed values, see: