You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2011/12/07 15:31:13 UTC

svn commit: r1211453 - in /incubator/stanbol/trunk: commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/ enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/ enhancer/engines/topic/src/test/java/org/apache/st...

Author: ogrisel
Date: Wed Dec  7 14:31:13 2011
New Revision: 1211453

URL: http://svn.apache.org/viewvc?rev=1211453&view=rev
Log:
STANBOL-197: first working topic classification in unit-tests

Added:
    incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv
Modified:
    incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java
    incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
    incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
    incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml

Modified: incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java (original)
+++ incubator/stanbol/trunk/commons/solr/core/src/main/java/org/apache/stanbol/commons/solr/utils/StreamQueryRequest.java Wed Dec  7 14:31:13 2011
@@ -1,21 +1,22 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.stanbol.commons.solr.utils;
 
+import java.io.File;
 import java.util.Arrays;
 import java.util.Collection;
 
@@ -42,7 +43,7 @@ public class StreamQueryRequest extends 
     public StreamQueryRequest(SolrQuery q) {
         super(q, METHOD.POST);
         String[] bodies = q.remove(CommonParams.STREAM_BODY);
-        if (bodies!= null && bodies.length > 0) {
+        if (bodies != null && bodies.length > 0) {
             String body = StringUtils.join(bodies, " ");
             this.contentStream = new StringStream(body);
         }

Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Wed Dec  7 14:31:13 2011
@@ -219,7 +219,7 @@ public class TopicClassificationEngine i
     public void computeEnhancements(ContentItem ci) throws EngineException {
         String text = getTextFromContentItem(ci);
         suggestTopics(text);
-        
+
         // TODO: express the results as RDF.
     }
 
@@ -242,7 +242,12 @@ public class TopicClassificationEngine i
             QueryResponse response = request.process(solrServer);
             SolrDocumentList results = response.getResults();
             for (SolrDocument result : results.toArray(new SolrDocument[0])) {
-                suggestedTopics.add(new TopicSuggestion((String) result.getFirstValue(TOPIC_URI_FIELD), 0.0));
+                String uri = (String) result.getFirstValue(topicUriField);
+                if (uri == null) {
+                    throw new EngineException(String.format("Solr Core '%s' is missing required field '%s'.",
+                        solrCoreId, topicUriField));
+                }
+                suggestedTopics.add(new TopicSuggestion(uri, 0.0));
             }
         } catch (SolrServerException e) {
             if ("unknown handler: /mlt".equals(e.getCause().getMessage())) {

Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java Wed Dec  7 14:31:13 2011
@@ -35,16 +35,27 @@ import junit.framework.TestCase;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.core.CoreContainer;
+import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
 public class TopicEngineTest {
 
+    private static final Logger log = LoggerFactory.getLogger(TopicEngineTest.class);
+
+    public static final String TEST_SOLR_CORE_ID = "test";
+
     EmbeddedSolrServer solrServer;
 
     File solrHome;
@@ -62,7 +73,9 @@ public class TopicEngineTest {
         IOUtils.copy(is, new FileOutputStream(solrFile));
 
         // solr conf folder with schema
-        File solrConfFolder = new File(solrHome, "conf");
+        File solrCoreFolder = new File(solrHome, TEST_SOLR_CORE_ID);
+        solrCoreFolder.mkdir();
+        File solrConfFolder = new File(solrCoreFolder, "conf");
         solrConfFolder.mkdir();
         File schemaFile = new File(solrConfFolder, "schema.xml");
         is = getClass().getResourceAsStream("/test_schema.xml");
@@ -75,13 +88,38 @@ public class TopicEngineTest {
         IOUtils.copy(is, new FileOutputStream(solrConfigFile));
 
         // create the embedded server
-        CoreContainer coreContainer = new CoreContainer(solrHome.getAbsolutePath());
-        solrServer = new EmbeddedSolrServer(coreContainer, "test");
+        CoreContainer coreContainer = new CoreContainer(solrHome.getAbsolutePath(), solrFile);
+        solrServer = new EmbeddedSolrServer(coreContainer, TEST_SOLR_CORE_ID);
     }
 
     @After
     public void cleanupEmbeddedSolrServer() {
         FileUtils.deleteQuietly(solrHome);
+        solrHome = null;
+        solrServer = null;
+    }
+
+    protected void loadSampleTopicsFromTSV() throws IOException, SolrServerException {
+        assertNotNull(solrHome);
+        assertNotNull(solrServer);
+        String topicSnippetsPath = "/topics_abstracts_snippet.tsv";
+        InputStream is = getClass().getResourceAsStream(topicSnippetsPath);
+        assertNotNull("Could not find test resource: " + topicSnippetsPath, is);
+
+        // Build a query for the CSV importer
+        SolrQuery query = new SolrQuery();
+        query.setQueryType("/update/csv");
+        query.set("commit", true);
+        query.set("separator", "\t");
+        query.set("headers", false);
+        query.set("fieldnames", "topic,popularity,paths,text");
+        query.set(CommonParams.STREAM_CONTENTTYPE, "text/plan;charset=utf-8");
+        query.set(CommonParams.STREAM_BODY, IOUtils.toString(is, "utf-8"));
+
+        // Upload an index
+        QueryResponse response = new StreamQueryRequest(query).process(solrServer);
+        assertNotNull(response);
+        log.info(String.format("Indexed test topics in %dms", response.getElapsedTime()));
     }
 
     protected Hashtable<String,Object> getDefaultConfigParams() {
@@ -130,11 +168,26 @@ public class TopicEngineTest {
         assertEquals(engine.acceptedLanguages, Arrays.asList("en", "fr"));
     }
 
-    //@Test
-    public void testClassificationTest() throws Exception {
+    @Test
+    public void testEmptyIndexTopicClassification() throws Exception {
         TopicClassificationEngine engine = TopicClassificationEngine.fromParameters(getDefaultConfigParams());
         List<TopicSuggestion> suggestedTopics = engine.suggestTopics("This is a test.");
         assertNotNull(suggestedTopics);
-        // TODO implement me
+        assertEquals(suggestedTopics.size(), 0);
+    }
+
+    @Test
+    public void testTopicClassification() throws Exception {
+        loadSampleTopicsFromTSV();
+        TopicClassificationEngine engine = TopicClassificationEngine.fromParameters(getDefaultConfigParams());
+        List<TopicSuggestion> suggestedTopics = engine
+                .suggestTopics("The Man Who Shot Liberty Valance is a 1962"
+                               + " American Western film directed by John Ford,"
+                               + " narrated by Charlton Heston and starring James"
+                               + " Stewart, John Wayne and Vivien Leigh.");
+        assertNotNull(suggestedTopics);
+        assertEquals(suggestedTopics.size(), 10);
+        TopicSuggestion bestSuggestion = suggestedTopics.get(0);
+        assertEquals(bestSuggestion.uri, "Category:American_films");
     }
 }

Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml Wed Dec  7 14:31:13 2011
@@ -28,8 +28,8 @@
     <fieldType name="text" class="solr.TextField">
       <analyzer type="index">
         <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true"
-          words="stopwords_en.txt" enablePositionIncrements="false" />
+<!--         <filter class="solr.StopFilterFactory" ignoreCase="true" -->
+<!--           words="stopwords_en.txt" enablePositionIncrements="false" /> -->
         <filter class="solr.LowerCaseFilterFactory"/>
         <!-- The use of Shingle might help improve the quality but they increase
         the size of the index far too much. It would be better to use a
@@ -42,10 +42,10 @@
       </analyzer>
       <analyzer type="query">
         <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true"
-          words="stopwords_en.txt" enablePositionIncrements="false" />
-        <filter class="solr.SynonymFilterFactory"
-          synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+<!--         <filter class="solr.StopFilterFactory" ignoreCase="true" -->
+<!--           words="stopwords_en.txt" enablePositionIncrements="false" /> -->
+<!--         <filter class="solr.SynonymFilterFactory" -->
+<!--           synonyms="synonyms.txt" ignoreCase="true" expand="true"/> -->
         <filter class="solr.LowerCaseFilterFactory"/>
         <!--
         <filter class="solr.ShingleFilterFactory" maxShingleSize="2"
@@ -57,7 +57,7 @@
  </types>
 
  <fields>
-   <field name="id" type="string" indexed="true" stored="true" required="true" /> 
+   <field name="topic" type="string" indexed="true" stored="true" required="true" /> 
    <field name="type" type="string" indexed="true" stored="true" multiValued="true" />
    <field name="paths" type="string" indexed="true" stored="true" multiValued="true" /> 
    <field name="text" type="text" indexed="true" stored="false"
@@ -65,7 +65,7 @@
    <field name="popularity" type="int" indexed="true" stored="true" />
  </fields>
 
- <uniqueKey>id</uniqueKey>
+ <uniqueKey>topic</uniqueKey>
  <defaultSearchField>text</defaultSearchField>
  <solrQueryParser defaultOperator="AND"/>
 </schema>

Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml?rev=1211453&r1=1211452&r2=1211453&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml Wed Dec  7 14:31:13 2011
@@ -47,7 +47,7 @@
        that you fully re-index after changing this setting as it can
        affect both how text is indexed and queried.
     -->
-  <luceneMatchVersion>LUCENE_33</luceneMatchVersion>
+  <luceneMatchVersion>LUCENE_32</luceneMatchVersion>
 
   <!-- lib directives can be used to instruct Solr to load an Jars
        identified and use them to resolve any "plugins" specified in
@@ -765,74 +765,6 @@
       -->
     </requestHandler>
 
-  <!-- A Robust Example
-
-       This example SearchHandler declaration shows off usage of the
-       SearchHandler with many defaults declared
-
-       Note that multiple instances of the same Request Handler
-       (SearchHandler) can be registered multiple times with different
-       names (and different init parameters)
-    -->
-  <requestHandler name="/browse" class="solr.SearchHandler">
-     <lst name="defaults">
-       <str name="echoParams">explicit</str>
-
-       <!-- VelocityResponseWriter settings -->
-       <str name="wt">velocity</str>
-
-       <str name="v.template">browse</str>
-       <str name="v.layout">layout</str>
-       <str name="title">Solritas</str>
-
-       <str name="defType">edismax</str>
-       <str name="q.alt">*:*</str>
-       <str name="rows">10</str>
-       <str name="fl">*,score</str>
-       <str name="mlt.qf">
-         text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
-       </str>
-       <str name="mlt.fl">text,features,name,sku,id,manu,cat</str>
-       <int name="mlt.count">3</int>
-
-       <str name="qf">
-          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
-       </str>
-
-       <str name="facet">on</str>
-       <str name="facet.field">cat</str>
-       <str name="facet.field">manu_exact</str>
-       <str name="facet.query">ipod</str>
-       <str name="facet.query">GB</str>
-       <str name="facet.mincount">1</str>
-       <str name="facet.pivot">cat,inStock</str>
-       <str name="facet.range">price</str>
-       <int name="f.price.facet.range.start">0</int>
-       <int name="f.price.facet.range.end">600</int>
-       <int name="f.price.facet.range.gap">50</int>
-       <str name="f.price.facet.range.other">after</str>
-       <str name="facet.range">manufacturedate_dt</str>
-       <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
-       <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
-       <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
-       <str name="f.manufacturedate_dt.facet.range.other">before</str>
-       <str name="f.manufacturedate_dt.facet.range.other">after</str>
-
-
-       <!-- Highlighting defaults -->
-       <str name="hl">on</str>
-       <str name="hl.fl">text features name</str>
-       <str name="f.name.hl.fragsize">0</str>
-       <str name="f.name.hl.alternateField">name</str>
-     </lst>
-     <arr name="last-components">
-       <str>spellcheck</str>
-     </arr>
-     <!--
-     <str name="url-scheme">httpx</str>
-     -->
-  </requestHandler>
-
   <!-- XML Update Request Handler.  
        
        http://wiki.apache.org/solr/UpdateXmlMessages
@@ -962,27 +894,6 @@
     -->
   <requestHandler name="/admin/" 
                   class="solr.admin.AdminHandlers" />
-  <!-- This single handler is equivalent to the following... -->
-  <!--
-     <requestHandler name="/admin/luke"       class="solr.admin.LukeRequestHandler" />
-     <requestHandler name="/admin/system"     class="solr.admin.SystemInfoHandler" />
-     <requestHandler name="/admin/plugins"    class="solr.admin.PluginInfoHandler" />
-     <requestHandler name="/admin/threads"    class="solr.admin.ThreadDumpHandler" />
-     <requestHandler name="/admin/properties" class="solr.admin.PropertiesRequestHandler" />
-     <requestHandler name="/admin/file"       class="solr.admin.ShowFileRequestHandler" >
-    -->
-  <!-- If you wish to hide files under ${solr.home}/conf, explicitly
-       register the ShowFileRequestHandler using: 
-    -->
-  <!--
-     <requestHandler name="/admin/file" 
-                     class="solr.admin.ShowFileRequestHandler" >
-       <lst name="invariants">
-         <str name="hidden">synonyms.txt</str> 
-         <str name="hidden">anotherfile.txt</str> 
-       </lst>
-     </requestHandler>
-    -->
 
   <!-- ping/healthcheck -->
   <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
@@ -1000,520 +911,7 @@
      <str name="echoHandler">true</str>
     </lst>
   </requestHandler>
-  
-  <!-- Solr Replication
-
-       The SolrReplicationHandler supports replicating indexes from a
-       "master" used for indexing and "salves" used for queries.
-
-       http://wiki.apache.org/solr/SolrReplication 
-
-       In the example below, remove the <lst name="master"> section if
-       this is just a slave and remove  the <lst name="slave"> section
-       if this is just a master.
-    -->
-  <!--
-     <requestHandler name="/replication" class="solr.ReplicationHandler" >
-       <lst name="master">
-         <str name="replicateAfter">commit</str>
-         <str name="replicateAfter">startup</str>
-         <str name="confFiles">schema.xml,stopwords.txt</str>
-       </lst>
-       <lst name="slave">
-         <str name="masterUrl">http://localhost:8983/solr/replication</str>
-         <str name="pollInterval">00:00:60</str>
-       </lst>
-     </requestHandler>
-    -->
-
-  <!-- Search Components
-
-       Search components are registered to SolrCore and used by 
-       instances of SearchHandler (which can access them by name)
-       
-       By default, the following components are available:
-       
-       <searchComponent name="query"     class="solr.QueryComponent" />
-       <searchComponent name="facet"     class="solr.FacetComponent" />
-       <searchComponent name="mlt"       class="solr.MoreLikeThisComponent" />
-       <searchComponent name="highlight" class="solr.HighlightComponent" />
-       <searchComponent name="stats"     class="solr.StatsComponent" />
-       <searchComponent name="debug"     class="solr.DebugComponent" />
-   
-       Default configuration in a requestHandler would look like:
-
-       <arr name="components">
-         <str>query</str>
-         <str>facet</str>
-         <str>mlt</str>
-         <str>highlight</str>
-         <str>stats</str>
-         <str>debug</str>
-       </arr>
-
-       If you register a searchComponent to one of the standard names, 
-       that will be used instead of the default.
-
-       To insert components before or after the 'standard' components, use:
-    
-       <arr name="first-components">
-         <str>myFirstComponentName</str>
-       </arr>
-    
-       <arr name="last-components">
-         <str>myLastComponentName</str>
-       </arr>
-
-       NOTE: The component registered with the name "debug" will
-       always be executed after the "last-components" 
-       
-     -->
-
-   <!-- Spell Check
-
-        The spell check component can return a list of alternative spelling
-        suggestions.  
-
-        http://wiki.apache.org/solr/SpellCheckComponent
-     -->
-  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
-
-    <str name="queryAnalyzerFieldType">textSpell</str>
-
-    <!-- Multiple "Spell Checkers" can be declared and used by this
-         component
-      -->
-
-    <!-- a spellchecker built from a field of the main index, and
-         written to disk
-      -->
-    <lst name="spellchecker">
-      <str name="name">default</str>
-      <str name="field">name</str>
-      <str name="spellcheckIndexDir">spellchecker</str>
-      <!-- uncomment this to require terms to occur in 1% of the documents in order to be included in the dictionary
-      	<float name="thresholdTokenFrequency">.01</float>
-      -->
-    </lst>
-
-    <!-- a spellchecker that uses a different distance measure -->
-    <!--
-       <lst name="spellchecker">
-         <str name="name">jarowinkler</str>
-         <str name="field">spell</str>
-         <str name="distanceMeasure">
-           org.apache.lucene.search.spell.JaroWinklerDistance
-         </str>
-         <str name="spellcheckIndexDir">spellcheckerJaro</str>
-       </lst>
-     -->
-
-    <!-- a spellchecker that use an alternate comparator 
-
-         comparatorClass be one of:
-          1. score (default)
-          2. freq (Frequency first, then score)
-          3. A fully qualified class name
-      -->
-    <!--
-       <lst name="spellchecker">
-         <str name="name">freq</str>
-         <str name="field">lowerfilt</str>
-         <str name="spellcheckIndexDir">spellcheckerFreq</str>
-         <str name="comparatorClass">freq</str>
-         <str name="buildOnCommit">true</str>
-      -->
-
-    <!-- A spellchecker that reads the list of words from a file -->
-    <!--
-       <lst name="spellchecker">
-         <str name="classname">solr.FileBasedSpellChecker</str>
-         <str name="name">file</str>
-         <str name="sourceLocation">spellings.txt</str>
-         <str name="characterEncoding">UTF-8</str>
-         <str name="spellcheckIndexDir">spellcheckerFile</str>
-       </lst>
-      -->
-  </searchComponent>
-
-  <!-- A request handler for demonstrating the spellcheck component.  
-
-       NOTE: This is purely as an example.  The whole purpose of the
-       SpellCheckComponent is to hook it into the request handler that
-       handles your normal user queries so that a separate request is
-       not needed to get suggestions.
-
-       IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS
-       NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
-       
-       See http://wiki.apache.org/solr/SpellCheckComponent for details
-       on the request parameters.
-    -->
-  <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
-    <lst name="defaults">
-      <str name="spellcheck.onlyMorePopular">false</str>
-      <str name="spellcheck.extendedResults">false</str>
-      <str name="spellcheck.count">1</str>
-    </lst>
-    <arr name="last-components">
-      <str>spellcheck</str>
-    </arr>
-  </requestHandler>
-
-  <!-- Term Vector Component
 
-       http://wiki.apache.org/solr/TermVectorComponent
-    -->
-  <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
-
-  <!-- A request handler for demonstrating the term vector component
-
-       This is purely as an example.
-
-       In reality you will likely want to add the component to your 
-       already specified request handlers. 
-    -->
-  <requestHandler name="tvrh" class="solr.SearchHandler" startup="lazy">
-    <lst name="defaults">
-      <bool name="tv">true</bool>
-    </lst>
-    <arr name="last-components">
-      <str>tvComponent</str>
-    </arr>
-  </requestHandler>
-
-  <!-- Clustering Component
-
-       http://wiki.apache.org/solr/ClusteringComponent
-
-       This relies on third party jars which are notincluded in the
-       release.  To use this component (and the "/clustering" handler)
-       Those jars will need to be downloaded, and you'll need to set
-       the solr.cluster.enabled system property when running solr...
-
-          java -Dsolr.clustering.enabled=true -jar start.jar
-    -->
-  <searchComponent name="clustering" 
-                   enable="${solr.clustering.enabled:false}"
-                   class="solr.clustering.ClusteringComponent" >
-    <!-- Declare an engine -->
-    <lst name="engine">
-      <!-- The name, only one can be named "default" -->
-      <str name="name">default</str>
-
-      <!-- Class name of Carrot2 clustering algorithm. 
-           
-           Currently available algorithms are:
-           
-           * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
-           * org.carrot2.clustering.stc.STCClusteringAlgorithm
-           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
-           
-           See http://project.carrot2.org/algorithms.html for the
-           algorithm's characteristics.
-        -->
-      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
-
-      <!-- Overriding values for Carrot2 default algorithm attributes.
-
-           For a description of all available attributes, see:
-           http://download.carrot2.org/stable/manual/#chapter.components.
-           Use attribute key as name attribute of str elements
-           below. These can be further overridden for individual
-           requests by specifying attribute key as request parameter
-           name and attribute value as parameter value.
-        -->
-      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
-      
-      <!-- Location of Carrot2 lexical resources.
-
-           A directory from which to load Carrot2-specific stop words
-           and stop labels. Absolute or relative to Solr config directory.
-           If a specific resource (e.g. stopwords.en) is present in the
-           specified dir, it will completely override the corresponding
-           default one that ships with Carrot2.
-
-           For an overview of Carrot2 lexical resources, see:
-           http://download.carrot2.org/head/manual/#chapter.lexical-resources
-        -->
-      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
-
-      <!-- The language to assume for the documents.
-           
-           For a list of allowed values, see:
-           http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
-       -->
-      <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
-    </lst>
-    <lst name="engine">
-      <str name="name">stc</str>
-      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
-    </lst>
-  </searchComponent>
-
-  <!-- A request handler for demonstrating the clustering component
-
-       This is purely as an example.
-
-       In reality you will likely want to add the component to your 
-       already specified request handlers. 
-    -->
-  <requestHandler name="/clustering"
-                  startup="lazy"
-                  enable="${solr.clustering.enabled:false}"
-                  class="solr.SearchHandler">
-    <lst name="defaults">
-      <bool name="clustering">true</bool>
-      <str name="clustering.engine">default</str>
-      <bool name="clustering.results">true</bool>
-      <!-- The title field -->
-      <str name="carrot.title">name</str>
-      <str name="carrot.url">id</str>
-      <!-- The field to cluster on -->
-       <str name="carrot.snippet">features</str>
-       <!-- produce summaries -->
-       <bool name="carrot.produceSummary">true</bool>
-       <!-- the maximum number of labels per cluster -->
-       <!--<int name="carrot.numDescriptions">5</int>-->
-       <!-- produce sub clusters -->
-       <bool name="carrot.outputSubClusters">false</bool>
-       
-       <str name="defType">edismax</str>
-       <str name="qf">
-          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
-       </str>
-       <str name="q.alt">*:*</str>
-       <str name="rows">10</str>
-       <str name="fl">*,score</str>
-    </lst>     
-    <arr name="last-components">
-      <str>clustering</str>
-    </arr>
-  </requestHandler>
-  
-  <!-- Terms Component
-
-       http://wiki.apache.org/solr/TermsComponent
-
-       A component to return terms and document frequency of those
-       terms
-    -->
-  <searchComponent name="terms" class="solr.TermsComponent"/>
-
-  <!-- A request handler for demonstrating the terms component -->
-  <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
-     <lst name="defaults">
-      <bool name="terms">true</bool>
-    </lst>     
-    <arr name="components">
-      <str>terms</str>
-    </arr>
-  </requestHandler>
-
-
-  <!-- Query Elevation Component
-
-       http://wiki.apache.org/solr/QueryElevationComponent
-
-       a search component that enables you to configure the top
-       results for a given query regardless of the normal lucene
-       scoring.
-    -->
-  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
-    <!-- pick a fieldType to analyze queries -->
-    <str name="queryFieldType">string</str>
-    <str name="config-file">elevate.xml</str>
-  </searchComponent>
-
-  <!-- A request handler for demonstrating the elevator component -->
-  <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
-    <lst name="defaults">
-      <str name="echoParams">explicit</str>
-    </lst>
-    <arr name="last-components">
-      <str>elevator</str>
-    </arr>
-  </requestHandler>
-
-  <!-- Highlighting Component
-
-       http://wiki.apache.org/solr/HighlightingParameters
-    -->
-  <searchComponent class="solr.HighlightComponent" name="highlight">
-    <highlighting>
-      <!-- Configure the standard fragmenter -->
-      <!-- This could most likely be commented out in the "default" case -->
-      <fragmenter name="gap" 
-                  default="true"
-                  class="solr.highlight.GapFragmenter">
-        <lst name="defaults">
-          <int name="hl.fragsize">100</int>
-        </lst>
-      </fragmenter>
-
-      <!-- A regular-expression-based fragmenter 
-           (for sentence extraction) 
-        -->
-      <fragmenter name="regex" 
-                  class="solr.highlight.RegexFragmenter">
-        <lst name="defaults">
-          <!-- slightly smaller fragsizes work better because of slop -->
-          <int name="hl.fragsize">70</int>
-          <!-- allow 50% slop on fragment sizes -->
-          <float name="hl.regex.slop">0.5</float>
-          <!-- a basic sentence pattern -->
-          <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
-        </lst>
-      </fragmenter>
-
-      <!-- Configure the standard formatter -->
-      <formatter name="html" 
-                 default="true"
-                 class="solr.highlight.HtmlFormatter">
-        <lst name="defaults">
-          <str name="hl.simple.pre"><![CDATA[<em>]]></str>
-          <str name="hl.simple.post"><![CDATA[</em>]]></str>
-        </lst>
-      </formatter>
-
-      <!-- Configure the standard encoder -->
-      <encoder name="html" 
-               class="solr.highlight.HtmlEncoder" />
-
-      <!-- Configure the standard fragListBuilder -->
-      <fragListBuilder name="simple" 
-                       default="true"
-                       class="solr.highlight.SimpleFragListBuilder"/>
-
-      <!-- Configure the single fragListBuilder -->
-      <fragListBuilder name="single" 
-                       class="solr.highlight.SingleFragListBuilder"/>
-
-      <!-- default tag FragmentsBuilder -->
-      <fragmentsBuilder name="default" 
-                        default="true"
-                        class="solr.highlight.ScoreOrderFragmentsBuilder">
-        <!-- 
-        <lst name="defaults">
-          <str name="hl.multiValuedSeparatorChar">/</str>
-        </lst>
-        -->
-      </fragmentsBuilder>
-
-      <!-- multi-colored tag FragmentsBuilder -->
-      <fragmentsBuilder name="colored" 
-                        class="solr.highlight.ScoreOrderFragmentsBuilder">
-        <lst name="defaults">
-          <str name="hl.tag.pre"><![CDATA[
-               <b style="background:yellow">,<b style="background:lawgreen">,
-               <b style="background:aquamarine">,<b style="background:magenta">,
-               <b style="background:palegreen">,<b style="background:coral">,
-               <b style="background:wheat">,<b style="background:khaki">,
-               <b style="background:lime">,<b style="background:deepskyblue">]]></str>
-          <str name="hl.tag.post"><![CDATA[</b>]]></str>
-        </lst>
-      </fragmentsBuilder>
-    </highlighting>
-  </searchComponent>
-
-  <!-- Update Processors
-
-       Chains of Update Processor Factories for dealing with Update
-       Requests can be declared, and then used by name in Update
-       Request Processors
-
-       http://wiki.apache.org/solr/UpdateRequestProcessor
-
-    --> 
-  <!-- Deduplication
-
-       An example dedup update processor that creates the "id" field
-       on the fly based on the hash code of some other fields.  This
-       example has overwriteDupes set to false since we are using the
-       id field as the signatureField and Solr will maintain
-       uniqueness based on that anyway.  
-       
-    -->
-  <!--
-     <updateRequestProcessorChain name="dedupe">
-       <processor class="solr.processor.SignatureUpdateProcessorFactory">
-         <bool name="enabled">true</bool>
-         <str name="signatureField">id</str>
-         <bool name="overwriteDupes">false</bool>
-         <str name="fields">name,features,cat</str>
-         <str name="signatureClass">solr.processor.Lookup3Signature</str>
-       </processor>
-       <processor class="solr.LogUpdateProcessorFactory" />
-       <processor class="solr.RunUpdateProcessorFactory" />
-     </updateRequestProcessorChain>
-    -->
-
-  <!-- Response Writers
-
-       http://wiki.apache.org/solr/QueryResponseWriter
-
-       Request responses will be written using the writer specified by
-       the 'wt' request parameter matching the name of a registered
-       writer.
-
-       The "default" writer is the default and will be used if 'wt' is
-       not specified in the request.
-    -->
-  <!-- The following response writers are implicitly configured unless
-       overridden...
-    -->
-  <!--
-     <queryResponseWriter name="xml" 
-                          default="true"
-                          class="solr.XMLResponseWriter" />
-     <queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
-     <queryResponseWriter name="python" class="solr.PythonResponseWriter"/>
-     <queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/>
-     <queryResponseWriter name="php" class="solr.PHPResponseWriter"/>
-     <queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/>
-     <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter"/>
-     <queryResponseWriter name="csv" class="solr.CSVResponseWriter"/>
-    -->
-  <!--
-     Custom response writers can be declared as needed...
-    -->
-  <!--
-     <queryResponseWriter name="custom" class="com.example.MyResponseWriter"/>
-    -->
-
-  <!-- XSLT response writer transforms the XML output by any xslt file found
-       in Solr's conf/xslt directory.  Changes to xslt files are checked for
-       every xsltCacheLifetimeSeconds.  
-    -->
-  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
-    <int name="xsltCacheLifetimeSeconds">5</int>
-  </queryResponseWriter>
-
-  <!-- Query Parsers
-
-       http://wiki.apache.org/solr/SolrQuerySyntax
-
-       Multiple QParserPlugins can be registered by name, and then
-       used in either the "defType" param for the QueryComponent (used
-       by SearchHandler) or in LocalParams
-    -->
-  <!-- example of registering a query parser -->
-  <!--
-     <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
-    -->
-
-  <!-- Function Parsers
-
-       http://wiki.apache.org/solr/FunctionQuery
-
-       Multiple ValueSourceParsers can be registered by name, and then
-       used as function names when using the "func" QParser.
-    -->
-  <!-- example of registering a custom function parser  -->
-  <!--
-     <valueSourceParser name="myfunc" 
-                        class="com.mycompany.MyValueSourceParser" />
-    -->
 
   <!-- Legacy config for the admin interface -->
   <admin>

Added: incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv?rev=1211453&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv (added)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/topics_abstracts_snippet.tsv Wed Dec  7 14:31:13 2011
@@ -0,0 +1,10 @@

[... 11 lines stripped ...]