You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2011/12/07 15:31:13 UTC
svn commit: r1211453 - in /incubator/stanbol/trunk:
commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/
enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/
enhancer/engines/topic/src/test/java/org/apache/st...
Author: ogrisel
Date: Wed Dec 7 14:31:13 2011
New Revision: 1211453
URL: http://svn.apache.org/viewvc?rev=1211453&view=rev
Log:
STANBOL-197: first working topic classification in unit-tests
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv
Modified:
incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml
Modified: incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java (original)
+++ incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java Wed Dec 7 14:31:13 2011
@@ -1,21 +1,22 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.stanbol.commons.solr.utils;
+import java.io.File;
import java.util.Arrays;
import java.util.Collection;
@@ -42,7 +43,7 @@ public class StreamQueryRequest extends
public StreamQueryRequest(SolrQuery q) {
super(q, METHOD.POST);
String[] bodies = q.remove(CommonParams.STREAM_BODY);
- if (bodies!= null && bodies.length > 0) {
+ if (bodies != null && bodies.length > 0) {
String body = StringUtils.join(bodies, " ");
this.contentStream = new StringStream(body);
}
Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Wed Dec 7 14:31:13 2011
@@ -219,7 +219,7 @@ public class TopicClassificationEngine i
public void computeEnhancements(ContentItem ci) throws EngineException {
String text = getTextFromContentItem(ci);
suggestTopics(text);
-
+
// TODO: express the results as RDF.
}
@@ -242,7 +242,12 @@ public class TopicClassificationEngine i
QueryResponse response = request.process(solrServer);
SolrDocumentList results = response.getResults();
for (SolrDocument result : results.toArray(new SolrDocument[0])) {
- suggestedTopics.add(new TopicSuggestion((String) result.getFirstValue(TOPIC_URI_FIELD), 0.0));
+ String uri = (String) result.getFirstValue(topicUriField);
+ if (uri == null) {
+ throw new EngineException(String.format("Solr Core '%s' is missing required field '%s'.",
+ solrCoreId, topicUriField));
+ }
+ suggestedTopics.add(new TopicSuggestion(uri, 0.0));
}
} catch (SolrServerException e) {
if ("unknown handler: /mlt".equals(e.getCause().getMessage())) {
Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java Wed Dec 7 14:31:13 2011
@@ -35,16 +35,27 @@ import junit.framework.TestCase;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.params.CommonParams;
import org.apache.solr.core.CoreContainer;
+import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
public class TopicEngineTest {
+ private static final Logger log = LoggerFactory.getLogger(TopicEngineTest.class);
+
+ public static final String TEST_SOLR_CORE_ID = "test";
+
EmbeddedSolrServer solrServer;
File solrHome;
@@ -62,7 +73,9 @@ public class TopicEngineTest {
IOUtils.copy(is, new FileOutputStream(solrFile));
// solr conf folder with schema
- File solrConfFolder = new File(solrHome, "conf");
+ File solrCoreFolder = new File(solrHome, TEST_SOLR_CORE_ID);
+ solrCoreFolder.mkdir();
+ File solrConfFolder = new File(solrCoreFolder, "conf");
solrConfFolder.mkdir();
File schemaFile = new File(solrConfFolder, "schema.xml");
is = getClass().getResourceAsStream("/test_schema.xml");
@@ -75,13 +88,38 @@ public class TopicEngineTest {
IOUtils.copy(is, new FileOutputStream(solrConfigFile));
// create the embedded server
- CoreContainer coreContainer = new CoreContainer(solrHome.getAbsolutePath());
- solrServer = new EmbeddedSolrServer(coreContainer, "test");
+ CoreContainer coreContainer = new CoreContainer(solrHome.getAbsolutePath(), solrFile);
+ solrServer = new EmbeddedSolrServer(coreContainer, TEST_SOLR_CORE_ID);
}
@After
public void cleanupEmbeddedSolrServer() {
FileUtils.deleteQuietly(solrHome);
+ solrHome = null;
+ solrServer = null;
+ }
+
+ protected void loadSampleTopicsFromTSV() throws IOException, SolrServerException {
+ assertNotNull(solrHome);
+ assertNotNull(solrServer);
+ String topicSnippetsPath = "/topics_abstracts_snippet.tsv";
+ InputStream is = getClass().getResourceAsStream(topicSnippetsPath);
+ assertNotNull("Could not find test resource: " + topicSnippetsPath, is);
+
+ // Build a query for the CSV importer
+ SolrQuery query = new SolrQuery();
+ query.setQueryType("/update/csv");
+ query.set("commit", true);
+ query.set("separator", "\t");
+ query.set("headers", false);
+ query.set("fieldnames", "topic,popularity,paths,text");
+ query.set(CommonParams.STREAM_CONTENTTYPE, "text/plan;charset=utf-8");
+ query.set(CommonParams.STREAM_BODY, IOUtils.toString(is, "utf-8"));
+
+ // Upload an index
+ QueryResponse response = new StreamQueryRequest(query).process(solrServer);
+ assertNotNull(response);
+ log.info(String.format("Indexed test topics in %dms", response.getElapsedTime()));
}
protected Hashtable<String,Object> getDefaultConfigParams() {
@@ -130,11 +168,26 @@ public class TopicEngineTest {
assertEquals(engine.acceptedLanguages, Arrays.asList("en", "fr"));
}
- //@Test
- public void testClassificationTest() throws Exception {
+ @Test
+ public void testEmptyIndexTopicClassification() throws Exception {
TopicClassificationEngine engine = TopicClassificationEngine.fromParameters(getDefaultConfigParams());
List<TopicSuggestion> suggestedTopics = engine.suggestTopics("This is a test.");
assertNotNull(suggestedTopics);
- // TODO implement me
+ assertEquals(suggestedTopics.size(), 0);
+ }
+
+ @Test
+ public void testTopicClassification() throws Exception {
+ loadSampleTopicsFromTSV();
+ TopicClassificationEngine engine = TopicClassificationEngine.fromParameters(getDefaultConfigParams());
+ List<TopicSuggestion> suggestedTopics = engine
+ .suggestTopics("The Man Who Shot Liberty Valance is a 1962"
+ + " American Western film directed by John Ford,"
+ + " narrated by Charlton Heston and starring James"
+ + " Stewart, John Wayne and Vivien Leigh.");
+ assertNotNull(suggestedTopics);
+ assertEquals(suggestedTopics.size(), 10);
+ TopicSuggestion bestSuggestion = suggestedTopics.get(0);
+ assertEquals(bestSuggestion.uri, "Category:American_films");
}
}
Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml Wed Dec 7 14:31:13 2011
@@ -28,8 +28,8 @@
<fieldType name="text" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true"
- words="stopwords_en.txt" enablePositionIncrements="false" />
+<!-- <filter class="solr.StopFilterFactory" ignoreCase="true" -->
+<!-- words="stopwords_en.txt" enablePositionIncrements="false" /> -->
<filter class="solr.LowerCaseFilterFactory"/>
<!-- The use of Shingle might help improve the quality but they increase
the size of the index far too much. It would be better to use a
@@ -42,10 +42,10 @@
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true"
- words="stopwords_en.txt" enablePositionIncrements="false" />
- <filter class="solr.SynonymFilterFactory"
- synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+<!-- <filter class="solr.StopFilterFactory" ignoreCase="true" -->
+<!-- words="stopwords_en.txt" enablePositionIncrements="false" /> -->
+<!-- <filter class="solr.SynonymFilterFactory" -->
+<!-- synonyms="synonyms.txt" ignoreCase="true" expand="true"/> -->
<filter class="solr.LowerCaseFilterFactory"/>
<!--
<filter class="solr.ShingleFilterFactory" maxShingleSize="2"
@@ -57,7 +57,7 @@
</types>
<fields>
- <field name="id" type="string" indexed="true" stored="true" required="true" />
+ <field name="topic" type="string" indexed="true" stored="true" required="true" />
<field name="type" type="string" indexed="true" stored="true" multiValued="true" />
<field name="paths" type="string" indexed="true" stored="true" multiValued="true" />
<field name="text" type="text" indexed="true" stored="false"
@@ -65,7 +65,7 @@
<field name="popularity" type="int" indexed="true" stored="true" />
</fields>
- <uniqueKey>id</uniqueKey>
+ <uniqueKey>topic</uniqueKey>
<defaultSearchField>text</defaultSearchField>
<solrQueryParser defaultOperator="AND"/>
</schema>
Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml Wed Dec 7 14:31:13 2011
@@ -47,7 +47,7 @@
that you fully re-index after changing this setting as it can
affect both how text is indexed and queried.
-->
- <luceneMatchVersion>LUCENE_33</luceneMatchVersion>
+ <luceneMatchVersion>LUCENE_32</luceneMatchVersion>
<!-- lib directives can be used to instruct Solr to load an Jars
identified and use them to resolve any "plugins" specified in
@@ -765,74 +765,6 @@
-->
</requestHandler>
- <!-- A Robust Example
-
- This example SearchHandler declaration shows off usage of the
- SearchHandler with many defaults declared
-
- Note that multiple instances of the same Request Handler
- (SearchHandler) can be registered multiple times with different
- names (and different init parameters)
- -->
- <requestHandler name="/browse" class="solr.SearchHandler">
- <lst name="defaults">
- <str name="echoParams">explicit</str>
-
- <!-- VelocityResponseWriter settings -->
- <str name="wt">velocity</str>
-
- <str name="v.template">browse</str>
- <str name="v.layout">layout</str>
- <str name="title">Solritas</str>
-
- <str name="defType">edismax</str>
- <str name="q.alt">*:*</str>
- <str name="rows">10</str>
- <str name="fl">*,score</str>
- <str name="mlt.qf">
- text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
- </str>
- <str name="mlt.fl">text,features,name,sku,id,manu,cat</str>
- <int name="mlt.count">3</int>
-
- <str name="qf">
- text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
- </str>
-
- <str name="facet">on</str>
- <str name="facet.field">cat</str>
- <str name="facet.field">manu_exact</str>
- <str name="facet.query">ipod</str>
- <str name="facet.query">GB</str>
- <str name="facet.mincount">1</str>
- <str name="facet.pivot">cat,inStock</str>
- <str name="facet.range">price</str>
- <int name="f.price.facet.range.start">0</int>
- <int name="f.price.facet.range.end">600</int>
- <int name="f.price.facet.range.gap">50</int>
- <str name="f.price.facet.range.other">after</str>
- <str name="facet.range">manufacturedate_dt</str>
- <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
- <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
- <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
- <str name="f.manufacturedate_dt.facet.range.other">before</str>
- <str name="f.manufacturedate_dt.facet.range.other">after</str>
-
-
- <!-- Highlighting defaults -->
- <str name="hl">on</str>
- <str name="hl.fl">text features name</str>
- <str name="f.name.hl.fragsize">0</str>
- <str name="f.name.hl.alternateField">name</str>
- </lst>
- <arr name="last-components">
- <str>spellcheck</str>
- </arr>
- <!--
- <str name="url-scheme">httpx</str>
- -->
- </requestHandler>
-
<!-- XML Update Request Handler.
http://wiki.apache.org/solr/UpdateXmlMessages
@@ -962,27 +894,6 @@
-->
<requestHandler name="/admin/"
class="solr.admin.AdminHandlers" />
- <!-- This single handler is equivalent to the following... -->
- <!--
- <requestHandler name="/admin/luke" class="solr.admin.LukeRequestHandler" />
- <requestHandler name="/admin/system" class="solr.admin.SystemInfoHandler" />
- <requestHandler name="/admin/plugins" class="solr.admin.PluginInfoHandler" />
- <requestHandler name="/admin/threads" class="solr.admin.ThreadDumpHandler" />
- <requestHandler name="/admin/properties" class="solr.admin.PropertiesRequestHandler" />
- <requestHandler name="/admin/file" class="solr.admin.ShowFileRequestHandler" >
- -->
- <!-- If you wish to hide files under ${solr.home}/conf, explicitly
- register the ShowFileRequestHandler using:
- -->
- <!--
- <requestHandler name="/admin/file"
- class="solr.admin.ShowFileRequestHandler" >
- <lst name="invariants">
- <str name="hidden">synonyms.txt</str>
- <str name="hidden">anotherfile.txt</str>
- </lst>
- </requestHandler>
- -->
<!-- ping/healthcheck -->
<requestHandler name="/admin/ping" class="solr.PingRequestHandler">
@@ -1000,520 +911,7 @@
<str name="echoHandler">true</str>
</lst>
</requestHandler>
-
- <!-- Solr Replication
-
- The SolrReplicationHandler supports replicating indexes from a
- "master" used for indexing and "salves" used for queries.
-
- http://wiki.apache.org/solr/SolrReplication
-
- In the example below, remove the <lst name="master"> section if
- this is just a slave and remove the <lst name="slave"> section
- if this is just a master.
- -->
- <!--
- <requestHandler name="/replication" class="solr.ReplicationHandler" >
- <lst name="master">
- <str name="replicateAfter">commit</str>
- <str name="replicateAfter">startup</str>
- <str name="confFiles">schema.xml,stopwords.txt</str>
- </lst>
- <lst name="slave">
- <str name="masterUrl">http://localhost:8983/solr/replication</str>
- <str name="pollInterval">00:00:60</str>
- </lst>
- </requestHandler>
- -->
-
- <!-- Search Components
-
- Search components are registered to SolrCore and used by
- instances of SearchHandler (which can access them by name)
-
- By default, the following components are available:
-
- <searchComponent name="query" class="solr.QueryComponent" />
- <searchComponent name="facet" class="solr.FacetComponent" />
- <searchComponent name="mlt" class="solr.MoreLikeThisComponent" />
- <searchComponent name="highlight" class="solr.HighlightComponent" />
- <searchComponent name="stats" class="solr.StatsComponent" />
- <searchComponent name="debug" class="solr.DebugComponent" />
-
- Default configuration in a requestHandler would look like:
-
- <arr name="components">
- <str>query</str>
- <str>facet</str>
- <str>mlt</str>
- <str>highlight</str>
- <str>stats</str>
- <str>debug</str>
- </arr>
-
- If you register a searchComponent to one of the standard names,
- that will be used instead of the default.
-
- To insert components before or after the 'standard' components, use:
-
- <arr name="first-components">
- <str>myFirstComponentName</str>
- </arr>
-
- <arr name="last-components">
- <str>myLastComponentName</str>
- </arr>
-
- NOTE: The component registered with the name "debug" will
- always be executed after the "last-components"
-
- -->
-
- <!-- Spell Check
-
- The spell check component can return a list of alternative spelling
- suggestions.
-
- http://wiki.apache.org/solr/SpellCheckComponent
- -->
- <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
-
- <str name="queryAnalyzerFieldType">textSpell</str>
-
- <!-- Multiple "Spell Checkers" can be declared and used by this
- component
- -->
-
- <!-- a spellchecker built from a field of the main index, and
- written to disk
- -->
- <lst name="spellchecker">
- <str name="name">default</str>
- <str name="field">name</str>
- <str name="spellcheckIndexDir">spellchecker</str>
- <!-- uncomment this to require terms to occur in 1% of the documents in order to be included in the dictionary
- <float name="thresholdTokenFrequency">.01</float>
- -->
- </lst>
-
- <!-- a spellchecker that uses a different distance measure -->
- <!--
- <lst name="spellchecker">
- <str name="name">jarowinkler</str>
- <str name="field">spell</str>
- <str name="distanceMeasure">
- org.apache.lucene.search.spell.JaroWinklerDistance
- </str>
- <str name="spellcheckIndexDir">spellcheckerJaro</str>
- </lst>
- -->
-
- <!-- a spellchecker that use an alternate comparator
-
- comparatorClass be one of:
- 1. score (default)
- 2. freq (Frequency first, then score)
- 3. A fully qualified class name
- -->
- <!--
- <lst name="spellchecker">
- <str name="name">freq</str>
- <str name="field">lowerfilt</str>
- <str name="spellcheckIndexDir">spellcheckerFreq</str>
- <str name="comparatorClass">freq</str>
- <str name="buildOnCommit">true</str>
- -->
-
- <!-- A spellchecker that reads the list of words from a file -->
- <!--
- <lst name="spellchecker">
- <str name="classname">solr.FileBasedSpellChecker</str>
- <str name="name">file</str>
- <str name="sourceLocation">spellings.txt</str>
- <str name="characterEncoding">UTF-8</str>
- <str name="spellcheckIndexDir">spellcheckerFile</str>
- </lst>
- -->
- </searchComponent>
-
- <!-- A request handler for demonstrating the spellcheck component.
-
- NOTE: This is purely as an example. The whole purpose of the
- SpellCheckComponent is to hook it into the request handler that
- handles your normal user queries so that a separate request is
- not needed to get suggestions.
-
- IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS
- NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
-
- See http://wiki.apache.org/solr/SpellCheckComponent for details
- on the request parameters.
- -->
- <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
- <lst name="defaults">
- <str name="spellcheck.onlyMorePopular">false</str>
- <str name="spellcheck.extendedResults">false</str>
- <str name="spellcheck.count">1</str>
- </lst>
- <arr name="last-components">
- <str>spellcheck</str>
- </arr>
- </requestHandler>
-
- <!-- Term Vector Component
- http://wiki.apache.org/solr/TermVectorComponent
- -->
- <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
-
- <!-- A request handler for demonstrating the term vector component
-
- This is purely as an example.
-
- In reality you will likely want to add the component to your
- already specified request handlers.
- -->
- <requestHandler name="tvrh" class="solr.SearchHandler" startup="lazy">
- <lst name="defaults">
- <bool name="tv">true</bool>
- </lst>
- <arr name="last-components">
- <str>tvComponent</str>
- </arr>
- </requestHandler>
-
- <!-- Clustering Component
-
- http://wiki.apache.org/solr/ClusteringComponent
-
- This relies on third party jars which are notincluded in the
- release. To use this component (and the "/clustering" handler)
- Those jars will need to be downloaded, and you'll need to set
- the solr.cluster.enabled system property when running solr...
-
- java -Dsolr.clustering.enabled=true -jar start.jar
- -->
- <searchComponent name="clustering"
- enable="${solr.clustering.enabled:false}"
- class="solr.clustering.ClusteringComponent" >
- <!-- Declare an engine -->
- <lst name="engine">
- <!-- The name, only one can be named "default" -->
- <str name="name">default</str>
-
- <!-- Class name of Carrot2 clustering algorithm.
-
- Currently available algorithms are:
-
- * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
- * org.carrot2.clustering.stc.STCClusteringAlgorithm
- * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
-
- See http://project.carrot2.org/algorithms.html for the
- algorithm's characteristics.
- -->
- <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
-
- <!-- Overriding values for Carrot2 default algorithm attributes.
-
- For a description of all available attributes, see:
- http://download.carrot2.org/stable/manual/#chapter.components.
- Use attribute key as name attribute of str elements
- below. These can be further overridden for individual
- requests by specifying attribute key as request parameter
- name and attribute value as parameter value.
- -->
- <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
-
- <!-- Location of Carrot2 lexical resources.
-
- A directory from which to load Carrot2-specific stop words
- and stop labels. Absolute or relative to Solr config directory.
- If a specific resource (e.g. stopwords.en) is present in the
- specified dir, it will completely override the corresponding
- default one that ships with Carrot2.
-
- For an overview of Carrot2 lexical resources, see:
- http://download.carrot2.org/head/manual/#chapter.lexical-resources
- -->
- <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
-
- <!-- The language to assume for the documents.
-
- For a list of allowed values, see:
- http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
- -->
- <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
- </lst>
- <lst name="engine">
- <str name="name">stc</str>
- <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
- </lst>
- </searchComponent>
-
- <!-- A request handler for demonstrating the clustering component
-
- This is purely as an example.
-
- In reality you will likely want to add the component to your
- already specified request handlers.
- -->
- <requestHandler name="/clustering"
- startup="lazy"
- enable="${solr.clustering.enabled:false}"
- class="solr.SearchHandler">
- <lst name="defaults">
- <bool name="clustering">true</bool>
- <str name="clustering.engine">default</str>
- <bool name="clustering.results">true</bool>
- <!-- The title field -->
- <str name="carrot.title">name</str>
- <str name="carrot.url">id</str>
- <!-- The field to cluster on -->
- <str name="carrot.snippet">features</str>
- <!-- produce summaries -->
- <bool name="carrot.produceSummary">true</bool>
- <!-- the maximum number of labels per cluster -->
- <!--<int name="carrot.numDescriptions">5</int>-->
- <!-- produce sub clusters -->
- <bool name="carrot.outputSubClusters">false</bool>
-
- <str name="defType">edismax</str>
- <str name="qf">
- text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
- </str>
- <str name="q.alt">*:*</str>
- <str name="rows">10</str>
- <str name="fl">*,score</str>
- </lst>
- <arr name="last-components">
- <str>clustering</str>
- </arr>
- </requestHandler>
-
- <!-- Terms Component
-
- http://wiki.apache.org/solr/TermsComponent
-
- A component to return terms and document frequency of those
- terms
- -->
- <searchComponent name="terms" class="solr.TermsComponent"/>
-
- <!-- A request handler for demonstrating the terms component -->
- <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
- <lst name="defaults">
- <bool name="terms">true</bool>
- </lst>
- <arr name="components">
- <str>terms</str>
- </arr>
- </requestHandler>
-
-
- <!-- Query Elevation Component
-
- http://wiki.apache.org/solr/QueryElevationComponent
-
- a search component that enables you to configure the top
- results for a given query regardless of the normal lucene
- scoring.
- -->
- <searchComponent name="elevator" class="solr.QueryElevationComponent" >
- <!-- pick a fieldType to analyze queries -->
- <str name="queryFieldType">string</str>
- <str name="config-file">elevate.xml</str>
- </searchComponent>
-
- <!-- A request handler for demonstrating the elevator component -->
- <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
- <lst name="defaults">
- <str name="echoParams">explicit</str>
- </lst>
- <arr name="last-components">
- <str>elevator</str>
- </arr>
- </requestHandler>
-
- <!-- Highlighting Component
-
- http://wiki.apache.org/solr/HighlightingParameters
- -->
- <searchComponent class="solr.HighlightComponent" name="highlight">
- <highlighting>
- <!-- Configure the standard fragmenter -->
- <!-- This could most likely be commented out in the "default" case -->
- <fragmenter name="gap"
- default="true"
- class="solr.highlight.GapFragmenter">
- <lst name="defaults">
- <int name="hl.fragsize">100</int>
- </lst>
- </fragmenter>
-
- <!-- A regular-expression-based fragmenter
- (for sentence extraction)
- -->
- <fragmenter name="regex"
- class="solr.highlight.RegexFragmenter">
- <lst name="defaults">
- <!-- slightly smaller fragsizes work better because of slop -->
- <int name="hl.fragsize">70</int>
- <!-- allow 50% slop on fragment sizes -->
- <float name="hl.regex.slop">0.5</float>
- <!-- a basic sentence pattern -->
- <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
- </lst>
- </fragmenter>
-
- <!-- Configure the standard formatter -->
- <formatter name="html"
- default="true"
- class="solr.highlight.HtmlFormatter">
- <lst name="defaults">
- <str name="hl.simple.pre"><![CDATA[<em>]]></str>
- <str name="hl.simple.post"><![CDATA[</em>]]></str>
- </lst>
- </formatter>
-
- <!-- Configure the standard encoder -->
- <encoder name="html"
- class="solr.highlight.HtmlEncoder" />
-
- <!-- Configure the standard fragListBuilder -->
- <fragListBuilder name="simple"
- default="true"
- class="solr.highlight.SimpleFragListBuilder"/>
-
- <!-- Configure the single fragListBuilder -->
- <fragListBuilder name="single"
- class="solr.highlight.SingleFragListBuilder"/>
-
- <!-- default tag FragmentsBuilder -->
- <fragmentsBuilder name="default"
- default="true"
- class="solr.highlight.ScoreOrderFragmentsBuilder">
- <!--
- <lst name="defaults">
- <str name="hl.multiValuedSeparatorChar">/</str>
- </lst>
- -->
- </fragmentsBuilder>
-
- <!-- multi-colored tag FragmentsBuilder -->
- <fragmentsBuilder name="colored"
- class="solr.highlight.ScoreOrderFragmentsBuilder">
- <lst name="defaults">
- <str name="hl.tag.pre"><![CDATA[
- <b style="background:yellow">,<b style="background:lawgreen">,
- <b style="background:aquamarine">,<b style="background:magenta">,
- <b style="background:palegreen">,<b style="background:coral">,
- <b style="background:wheat">,<b style="background:khaki">,
- <b style="background:lime">,<b style="background:deepskyblue">]]></str>
- <str name="hl.tag.post"><![CDATA[</b>]]></str>
- </lst>
- </fragmentsBuilder>
- </highlighting>
- </searchComponent>
-
- <!-- Update Processors
-
- Chains of Update Processor Factories for dealing with Update
- Requests can be declared, and then used by name in Update
- Request Processors
-
- http://wiki.apache.org/solr/UpdateRequestProcessor
-
- -->
- <!-- Deduplication
-
- An example dedup update processor that creates the "id" field
- on the fly based on the hash code of some other fields. This
- example has overwriteDupes set to false since we are using the
- id field as the signatureField and Solr will maintain
- uniqueness based on that anyway.
-
- -->
- <!--
- <updateRequestProcessorChain name="dedupe">
- <processor class="solr.processor.SignatureUpdateProcessorFactory">
- <bool name="enabled">true</bool>
- <str name="signatureField">id</str>
- <bool name="overwriteDupes">false</bool>
- <str name="fields">name,features,cat</str>
- <str name="signatureClass">solr.processor.Lookup3Signature</str>
- </processor>
- <processor class="solr.LogUpdateProcessorFactory" />
- <processor class="solr.RunUpdateProcessorFactory" />
- </updateRequestProcessorChain>
- -->
-
- <!-- Response Writers
-
- http://wiki.apache.org/solr/QueryResponseWriter
-
- Request responses will be written using the writer specified by
- the 'wt' request parameter matching the name of a registered
- writer.
-
- The "default" writer is the default and will be used if 'wt' is
- not specified in the request.
- -->
- <!-- The following response writers are implicitly configured unless
- overridden...
- -->
- <!--
- <queryResponseWriter name="xml"
- default="true"
- class="solr.XMLResponseWriter" />
- <queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
- <queryResponseWriter name="python" class="solr.PythonResponseWriter"/>
- <queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/>
- <queryResponseWriter name="php" class="solr.PHPResponseWriter"/>
- <queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/>
- <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter"/>
- <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/>
- -->
- <!--
- Custom response writers can be declared as needed...
- -->
- <!--
- <queryResponseWriter name="custom" class="com.example.MyResponseWriter"/>
- -->
-
- <!-- XSLT response writer transforms the XML output by any xslt file found
- in Solr's conf/xslt directory. Changes to xslt files are checked for
- every xsltCacheLifetimeSeconds.
- -->
- <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
- <int name="xsltCacheLifetimeSeconds">5</int>
- </queryResponseWriter>
-
- <!-- Query Parsers
-
- http://wiki.apache.org/solr/SolrQuerySyntax
-
- Multiple QParserPlugins can be registered by name, and then
- used in either the "defType" param for the QueryComponent (used
- by SearchHandler) or in LocalParams
- -->
- <!-- example of registering a query parser -->
- <!--
- <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
- -->
-
- <!-- Function Parsers
-
- http://wiki.apache.org/solr/FunctionQuery
-
- Multiple ValueSourceParsers can be registered by name, and then
- used as function names when using the "func" QParser.
- -->
- <!-- example of registering a custom function parser -->
- <!--
- <valueSourceParser name="myfunc"
- class="com.mycompany.MyValueSourceParser" />
- -->
<!-- Legacy config for the admin interface -->
<admin>
Added: incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv?rev=1211453&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv (added)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv Wed Dec 7 14:31:13 2011
@@ -0,0 +1,10 @@
[... 11 lines stripped ...]