You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2013/09/06 22:02:58 UTC
svn commit: r1520677 - in /lucene/dev/trunk/solr:
contrib/clustering/src/java/org/apache/solr/handler/clustering/
contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/
contrib/clustering/src/test-files/clustering/solr/collection1/conf...
Author: dweiss
Date: Fri Sep 6 20:02:57 2013
New Revision: 1520677
URL: http://svn.apache.org/r1520677
Log:
SOLR-5202: Support easier overrides of Carrot2 clustering attributes via XML data sets exported from the Workbench. Polished clustering configuration examples.
Added:
lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/clustering/carrot2/mock-external-attrs-attributes.xml
lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/
lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/
lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/default-attributes.xml
lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/kmeans-attributes.xml
lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/stc-attributes.xml
Modified:
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringEngine.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/solrconfig.xml
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
lucene/dev/trunk/solr/example/example-schemaless/solr/collection1/conf/solrconfig.xml
lucene/dev/trunk/solr/example/solr/collection1/conf/solrconfig.xml
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java Fri Sep 6 20:02:57 2013
@@ -59,11 +59,13 @@ public class ClusteringComponent extends
private Map<String, SearchClusteringEngine> searchClusteringEngines = new HashMap<String, SearchClusteringEngine>();
private Map<String, DocumentClusteringEngine> documentClusteringEngines = new HashMap<String, DocumentClusteringEngine>();
+
/**
- * Base name for all spell checker query parameters. This name is also used to
+ * Base name for all component parameters. This name is also used to
* register this component with SearchHandler.
*/
public static final String COMPONENT_NAME = "clustering";
+
private NamedList initParams;
@Override
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringEngine.java?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringEngine.java Fri Sep 6 20:02:57 2013
@@ -30,7 +30,6 @@ public class ClusteringEngine {
public String init(NamedList config, SolrCore core) {
name = (String) config.get(ENGINE_NAME);
-
return name;
}
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Fri Sep 6 20:02:57 2013
@@ -46,6 +46,7 @@ import org.apache.solr.common.util.Named
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
@@ -66,6 +67,8 @@ import org.carrot2.core.attribute.Attrib
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
+import org.carrot2.util.attribute.AttributeValueSet;
+import org.carrot2.util.attribute.AttributeValueSets;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
@@ -73,11 +76,11 @@ import org.carrot2.util.resource.Resourc
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.base.Objects;
+import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
-import com.google.common.io.Closeables;
-import com.google.common.io.Closer;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
@@ -122,8 +125,19 @@ public class CarrotClusteringEngine exte
public SolrResourceLocator(SolrCore core, SolrParams initParams) {
resourceLoader = core.getResourceLoader();
- carrot2ResourcesDir = initParams.get(
- CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+
+ @SuppressWarnings("deprecation")
+ String lexicalResourcesDir = initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR);
+ String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR);
+ carrot2ResourcesDir = firstNonNull(resourcesDir, lexicalResourcesDir, CARROT_RESOURCES_PREFIX);
+ }
+
+ @SuppressWarnings("unchecked")
+ public static <T> T firstNonNull(T... args) {
+ for (T t : args) {
+ if (t != null) return t;
+ }
+ throw new NullPointerException("At least one element has to be non-null.");
}
@Override
@@ -269,8 +283,52 @@ public class CarrotClusteringEngine exte
String result = super.init(config, core);
final SolrParams initParams = SolrParams.toSolrParams(config);
- // Initialize Carrot2 controller. Pass initialization attributes, if any.
+ // Initialization attributes for Carrot2 controller.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
+
+ // Customize Carrot2's resource lookup to first look for resources
+ // using Solr's resource loader. If that fails, try loading from the classpath.
+ ResourceLookup resourceLookup = new ResourceLookup(
+ // Solr-specific resource loading.
+ new SolrResourceLocator(core, initParams),
+ // Using the class loader directly because this time we want to omit the prefix
+ new ClassLoaderLocator(core.getResourceLoader().getClassLoader()));
+
+ DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+ .resourceLookup(resourceLookup);
+
+ // Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
+ // of this component. This by-name lookup convention simplifies algorithm configuration.
+ String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
+ log.info("Initializing Clustering Engine '" + Objects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
+
+ if (!Strings.isNullOrEmpty(componentName)) {
+ IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
+ if (attributeXmls.length > 0) {
+ if (attributeXmls.length > 1) {
+ log.warn("More than one attribute file found, first one will be used: "
+ + Arrays.toString(attributeXmls));
+ }
+
+ Thread ct = Thread.currentThread();
+ ClassLoader prev = ct.getContextClassLoader();
+ try {
+ ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
+
+ AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open());
+ AttributeValueSet defaultSet = avs.getDefaultAttributeValueSet();
+ initAttributes.putAll(defaultSet.getAttributeValues());
+ } catch (Exception e) {
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ "Could not read attributes XML for clustering component: "
+ + componentName, e);
+ } finally {
+ ct.setContextClassLoader(prev);
+ }
+ }
+ }
+
+ // Extract solrconfig attributes, they take precedence.
extractCarrotAttributes(initParams, initAttributes);
// Customize the stemmer and tokenizer factories. The implementations we provide here
@@ -291,15 +349,6 @@ public class CarrotClusteringEngine exte
// Pass the schema (via the core) to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrCore", core);
- // Customize Carrot2's resource lookup to first look for resources
- // using Solr's resource loader. If that fails, try loading from the classpath.
- DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes).resourceLookup(
- new ResourceLookup(
- // Solr-specific resource loading.
- new SolrResourceLocator(core, initParams),
- // Using the class loader directly because this time we want to omit the prefix
- new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
-
// Carrot2 uses current thread's context class loader to get
// certain classes (e.g. custom tokenizer/stemmer) at initialization time.
// To make sure classes from contrib JARs are available,
@@ -322,7 +371,9 @@ public class CarrotClusteringEngine exte
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
- this.clusteringAlgorithmClass = core.getResourceLoader().findClass(carrotAlgorithmClassName, IClusteringAlgorithm.class);
+ this.clusteringAlgorithmClass = core.getResourceLoader().findClass(
+ carrotAlgorithmClassName, IClusteringAlgorithm.class);
+
return result;
}
@@ -440,8 +491,9 @@ public class CarrotClusteringEngine exte
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
- if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
- //should only be one document
+ if (highlights != null && highlights.size() == 1) {
+ // should only be one value given our setup
+ // should only be one document
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
Modified: lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Fri Sep 6 20:02:57 2013
@@ -43,9 +43,21 @@ public final class CarrotParams {
public static String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
public static String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
- public static String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
public static String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
+ /**
+ * @deprecated Use {@link #RESOURCES_DIR} instead.
+ */
+ @Deprecated
+ public static String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
+ /**
+ * A replacement property pointing to Carrot<sup>2</sup> resources
+ * (a more generic version of the deprecated {@link #LEXICAL_RESOURCES_DIR}).
+ */
+ public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";
+
static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM,
@@ -62,6 +74,7 @@ public final class CarrotParams {
NUM_DESCRIPTIONS,
OUTPUT_SUB_CLUSTERS,
LEXICAL_RESOURCES_DIR,
+ RESOURCES_DIR,
LANGUAGE_CODE_MAP);
/** No instances. */
Added: lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/clustering/carrot2/mock-external-attrs-attributes.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/clustering/carrot2/mock-external-attrs-attributes.xml?rev=1520677&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/clustering/carrot2/mock-external-attrs-attributes.xml (added)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/clustering/carrot2/mock-external-attrs-attributes.xml Fri Sep 6 20:02:57 2013
@@ -0,0 +1,10 @@
+<attribute-sets default="overridden-attributes">
+ <attribute-set id="overridden-attributes">
+ <value-set>
+ <label>defaults</label>
+ <attribute key="MockClusteringAlgorithm.depth"><value value="1" /></attribute>
+ <attribute key="MockClusteringAlgorithm.labels"><value value="3" /></attribute>
+ <attribute key="MockClusteringAlgorithm.maxClusters"><value value="13" /></attribute>
+ </value-set>
+ </attribute-set>
+</attribute-sets>
\ No newline at end of file
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/solrconfig.xml?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test-files/clustering/solr/collection1/conf/solrconfig.xml Fri Sep 6 20:02:57 2013
@@ -328,6 +328,12 @@
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
<lst name="engine">
+ <str name="name">mock-external-attrs</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
+ <!-- takes precedence over external XML -->
+ <int name="MockClusteringAlgorithm.labels">4</int>
+ </lst>
+ <lst name="engine">
<str name="name">echo</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
</lst>
@@ -338,6 +344,11 @@
<lst name="engine">
<str name="name">lexical-resource-check-custom-resource-dir</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ <str name="carrot.resourcesDir">clustering/custom</str>
+ </lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check-custom-resource-dir-deprecated</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
<str name="carrot.lexicalResourcesDir">clustering/custom</str>
</lst>
<lst name="engine">
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Fri Sep 6 20:02:57 2013
@@ -122,7 +122,14 @@ public class CarrotClusteringEngineTest
@Test
public void testWithoutSubclusters() throws Exception {
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
- 1, 1, 0);
+ 1, 1, 0);
+ }
+
+ @Test
+ public void testExternalXmlAttributesFile() throws Exception {
+ checkClusters(
+ checkEngine(getClusteringEngine("mock-external-attrs"), 13),
+ 1, 4, 0);
}
@Test
@@ -189,6 +196,12 @@ public class CarrotClusteringEngineTest
"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
+ @Test
+ public void testLexicalResourcesFromSolrConfigCustomDirDeprecated() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir-deprecated",
+ "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+ }
+
private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
throws IOException {
ModifiableSolrParams params = new ModifiableSolrParams();
Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Fri Sep 6 20:02:57 2013
@@ -52,6 +52,12 @@ public class MockClusteringAlgorithm ext
@Input
@Processing
@Attribute
+ @IntRange(min = 0)
+ private int maxClusters = 0;
+
+ @Input
+ @Processing
+ @Attribute
private int otherTopicsModulo = 0;
@Override
@@ -61,6 +67,10 @@ public class MockClusteringAlgorithm ext
return;
}
+ if (maxClusters > 0) {
+ documents = documents.subList(0, maxClusters);
+ }
+
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
Modified: lucene/dev/trunk/solr/example/example-schemaless/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/example-schemaless/solr/collection1/conf/solrconfig.xml?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/example-schemaless/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/example/example-schemaless/solr/collection1/conf/solrconfig.xml Fri Sep 6 20:02:57 2013
@@ -1369,113 +1369,7 @@
</arr>
</requestHandler>
- <!-- Clustering Component
-
- http://wiki.apache.org/solr/ClusteringComponent
-
- You'll need to set the solr.clustering.enabled system property
- when running solr to run with clustering enabled:
-
- java -Dsolr.clustering.enabled=true -jar start.jar
-
- -->
- <searchComponent name="clustering"
- enable="${solr.clustering.enabled:false}"
- class="solr.clustering.ClusteringComponent" >
- <!-- Declare an engine -->
- <lst name="engine">
- <!-- The name, only one can be named "default" -->
- <str name="name">default</str>
-
- <!-- Class name of Carrot2 clustering algorithm.
-
- Currently available algorithms are:
-
- * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
- * org.carrot2.clustering.stc.STCClusteringAlgorithm
- * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
-
- See http://project.carrot2.org/algorithms.html for the
- algorithm's characteristics.
- -->
- <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
-
- <!-- Overriding values for Carrot2 default algorithm attributes.
-
- For a description of all available attributes, see:
- http://download.carrot2.org/stable/manual/#chapter.components.
- Use attribute key as name attribute of str elements
- below. These can be further overridden for individual
- requests by specifying attribute key as request parameter
- name and attribute value as parameter value.
- -->
- <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
-
- <!-- Location of Carrot2 lexical resources.
-
- A directory from which to load Carrot2-specific stop words
- and stop labels. Absolute or relative to Solr config directory.
- If a specific resource (e.g. stopwords.en) is present in the
- specified dir, it will completely override the corresponding
- default one that ships with Carrot2.
-
- For an overview of Carrot2 lexical resources, see:
- http://download.carrot2.org/head/manual/#chapter.lexical-resources
- -->
- <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
-
- <!-- The language to assume for the documents.
-
- For a list of allowed values, see:
- http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
- -->
- <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
- </lst>
- <lst name="engine">
- <str name="name">stc</str>
- <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
- </lst>
- </searchComponent>
-
- <!-- A request handler for demonstrating the clustering component
-
- This is purely as an example.
-
- In reality you will likely want to add the component to your
- already specified request handlers.
- -->
- <requestHandler name="/clustering"
- startup="lazy"
- enable="${solr.clustering.enabled:false}"
- class="solr.SearchHandler">
- <lst name="defaults">
- <bool name="clustering">true</bool>
- <str name="clustering.engine">default</str>
- <bool name="clustering.results">true</bool>
- <!-- The title field -->
- <str name="carrot.title">name</str>
- <str name="carrot.url">id</str>
- <!-- The field to cluster on -->
- <str name="carrot.snippet">features</str>
- <!-- produce summaries -->
- <bool name="carrot.produceSummary">true</bool>
- <!-- the maximum number of labels per cluster -->
- <!--<int name="carrot.numDescriptions">5</int>-->
- <!-- produce sub clusters -->
- <bool name="carrot.outputSubClusters">false</bool>
-
- <str name="defType">edismax</str>
- <str name="qf">
- text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
- </str>
- <str name="q.alt">*:*</str>
- <str name="rows">10</str>
- <str name="fl">*,score</str>
- </lst>
- <arr name="last-components">
- <str>clustering</str>
- </arr>
- </requestHandler>
+ <!-- Clustering Component. (Omitted here. See the default Solr example for a typical configuration.) -->
<!-- Terms Component
Added: lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/default-attributes.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/default-attributes.xml?rev=1520677&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/default-attributes.xml (added)
+++ lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/default-attributes.xml Fri Sep 6 20:02:57 2013
@@ -0,0 +1,24 @@
+<!--
+ Default configuration for the Lingo clustering algorithm.
+
+ This file can be loaded (and saved) by Carrot2 Workbench.
+ http://project.carrot2.org/download.html
+-->
+<attribute-sets default="attributes">
+ <attribute-set id="attributes">
+ <value-set>
+ <label>attributes</label>
+ <!--
+ The language to assume for clustered documents.
+ For a list of allowed values, see:
+ http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+ -->
+ <attribute key="MultilingualClustering.defaultLanguage">
+ <value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
+ </attribute>
+ <attribute key="LingoClusteringAlgorithm.desiredClusterCountBase">
+ <value type="java.lang.Integer" value="20"/>
+ </attribute>
+ </value-set>
+ </attribute-set>
+</attribute-sets>
\ No newline at end of file
Added: lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/kmeans-attributes.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/kmeans-attributes.xml?rev=1520677&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/kmeans-attributes.xml (added)
+++ lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/kmeans-attributes.xml Fri Sep 6 20:02:57 2013
@@ -0,0 +1,19 @@
+<!--
+ Default configuration for the bisecting k-means clustering algorithm.
+
+ This file can be loaded (and saved) by Carrot2 Workbench.
+ http://project.carrot2.org/download.html
+-->
+<attribute-sets default="attributes">
+ <attribute-set id="attributes">
+ <value-set>
+ <label>attributes</label>
+ <attribute key="MultilingualClustering.defaultLanguage">
+ <value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
+ </attribute>
+ <attribute key="MultilingualClustering.languageAggregationStrategy">
+ <value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
+ </attribute>
+ </value-set>
+ </attribute-set>
+</attribute-sets>
Added: lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/stc-attributes.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/stc-attributes.xml?rev=1520677&view=auto
==============================================================================
--- lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/stc-attributes.xml (added)
+++ lucene/dev/trunk/solr/example/solr/collection1/conf/clustering/carrot2/stc-attributes.xml Fri Sep 6 20:02:57 2013
@@ -0,0 +1,19 @@
+<!--
+ Default configuration for the STC clustering algorithm.
+
+ This file can be loaded (and saved) by Carrot2 Workbench.
+ http://project.carrot2.org/download.html
+-->
+<attribute-sets default="attributes">
+ <attribute-set id="attributes">
+ <value-set>
+ <label>attributes</label>
+ <attribute key="MultilingualClustering.defaultLanguage">
+ <value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
+ </attribute>
+ <attribute key="MultilingualClustering.languageAggregationStrategy">
+ <value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
+ </attribute>
+ </value-set>
+ </attribute-set>
+</attribute-sets>
Modified: lucene/dev/trunk/solr/example/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/collection1/conf/solrconfig.xml?rev=1520677&r1=1520676&r2=1520677&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/example/solr/collection1/conf/solrconfig.xml Fri Sep 6 20:02:57 2013
@@ -1382,59 +1382,57 @@
<searchComponent name="clustering"
enable="${solr.clustering.enabled:true}"
class="solr.clustering.ClusteringComponent" >
- <!-- Declare an engine -->
+ <!-- Declare a named clustering engine. Only one engine can be named
+ "default" (and it becomes the default one for the search component).
+ -->
<lst name="engine">
- <!-- The name, only one can be named "default" -->
<str name="name">default</str>
- <!-- Class name of Carrot2 clustering algorithm.
+ <!-- Class name of a clustering algorithm compatible with the Carrot2
+ framework.
- Currently available algorithms are:
-
+ Currently available open source algorithms are:
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm
* org.carrot2.clustering.stc.STCClusteringAlgorithm
* org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
-
- See http://project.carrot2.org/algorithms.html for the
- algorithm's characteristics.
- -->
- <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
- <!-- Overriding values for Carrot2 default algorithm attributes.
+ See http://project.carrot2.org/algorithms.html for more information.
- For a description of all available attributes, see:
- http://download.carrot2.org/stable/manual/#chapter.components.
- Use attribute key as name attribute of str elements
- below. These can be further overridden for individual
- requests by specifying attribute key as request parameter
- name and attribute value as parameter value.
+ The commercial Lingo3G algorithm (which needs to be installed separately) is defined as:
+ * com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm
-->
- <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+ <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
- <!-- Location of Carrot2 lexical resources.
+ <!-- Override location of the clustering algorithm's resources
+ (attribute definitions and lexical resources).
- A directory from which to load Carrot2-specific stop words
- and stop labels. Absolute or relative to Solr config directory.
+ A directory from which to load algorithm-specific stop words,
+ stop labels and attribute definition XMLs.
+ Absolute or relative to Solr config directory.
If a specific resource (e.g. stopwords.en) is present in the
specified dir, it will completely override the corresponding
- default one that ships with Carrot2.
+ default one that typically ships with each algorithm.
For an overview of Carrot2 lexical resources, see:
http://download.carrot2.org/head/manual/#chapter.lexical-resources
- -->
- <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
-
- <!-- The language to assume for the documents.
-
- For a list of allowed values, see:
- http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+
+ For an overview of Lingo3G lexical resources, see:
+ http://download.carrotsearch.com/lingo3g/manual/#chapter.lexical-resources
-->
- <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
+ <!-- <str name="carrot.resourcesDir">clustering/carrot2</str> -->
</lst>
+
+ <!-- An example definition for the STC clustering algorithm. -->
<lst name="engine">
<str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
</lst>
+
+ <!-- An example definition for the bisecting kmeans clustering algorithm. -->
+ <lst name="engine">
+ <str name="name">kmeans</str>
+ <str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str>
+ </lst>
</searchComponent>
<!-- A request handler for demonstrating the clustering component