You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/10/29 11:24:39 UTC
svn commit: r1403238 [1/3] - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction:
./
src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/
src/main/java/org/apache/stanbol/enhancer/engines/keywordextract...
Author: rwesten
Date: Mon Oct 29 10:24:38 2012
New Revision: 1403238
URL: http://svn.apache.org/viewvc?rev=1403238&view=rev
Log:
STANBOL-740: Adaptation of the KeywordLinkingEngine to the Stanbol NLP processing module
Added:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/EntityLinker.java (contents, props changed)
- copied, changed from r1393931, stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/LinkedEntity.java (contents, props changed)
- copied, changed from r1393931, stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/LinkedEntity.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/MainLabelTokenizer.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/Suggestion.java (contents, props changed)
- copied, changed from r1393931, stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/Utils.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/LabelTokenizer.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/LabelTokenizerManager.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/TextProcessingConfig.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpLabelTokenizer.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/test/resources/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/test/resources/log4j.properties
Removed:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/LinkedEntity.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/pom.xml
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngineTest.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/pom.xml?rev=1403238&r1=1403237&r2=1403238&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/pom.xml (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/pom.xml Mon Oct 29 10:24:38 2012
@@ -86,6 +86,11 @@
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.commons.stanboltools.offline</artifactId>
<version>0.9.0-incubating</version>
</dependency>
@@ -161,10 +166,15 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
- <dependency>
+ <dependency> <!-- used for debug level logging during tests -->
<groupId>org.slf4j</groupId>
- <artifactId>slf4j-simple</artifactId>
- <scope>test</scope>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <scope>test</scope>
</dependency>
</dependencies>
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1403238&r1=1403237&r2=1403238&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Mon Oct 29 10:24:38 2012
@@ -16,9 +16,12 @@
*/
package org.apache.stanbol.enhancer.engines.keywordextraction.engine;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
-import java.io.IOException;
+import java.lang.Integer; //preserve this!
+
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
@@ -26,17 +29,17 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
-import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
@@ -51,35 +54,36 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.ReferenceStrategy;
import org.apache.felix.scr.annotations.Service;
-import org.apache.stanbol.commons.opennlp.OpenNLP;
-import org.apache.stanbol.commons.opennlp.TextAnalyzer;
-import org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinker;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinker;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity.Occurrence;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity.Occurrence;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LabelTokenizer;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LabelTokenizerManager;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.TextProcessingConfig;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.EntityhubSearcher;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
-import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
@@ -88,7 +92,11 @@ import org.osgi.service.cm.Configuration
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
+/**
+ * TODO: Split "Engine" and "EngineConfiguration" in two classes
+ * @author Rupert Westenthaler
+ *
+ */
@Component(
configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
@@ -119,12 +127,17 @@ import org.slf4j.LoggerFactory;
intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
@Property(name=KeywordLinkingEngine.MIN_TOKEN_MATCH_FACTOR,floatValue=
EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR),
- @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
+ //Can no longer be supported with the new NLP chain!
+ //@Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
@Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
- @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
+ @Property(name=KeywordLinkingEngine.PROCESS_ONLY_PROPER_NOUNS_STATE,
+ boolValue=KeywordLinkingEngine.DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
+ @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,
+ cardinality=Integer.MAX_VALUE,
+ value={"*"}),
@Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""),
- @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000),
+ @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=Integer.MAX_VALUE),
@Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES,
boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE),
@Property(name=Constants.SERVICE_RANKING,intValue=0)
@@ -158,16 +171,77 @@ public class KeywordLinkingEngine
public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive";
public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
- public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
public static final String MAX_SUGGESTIONS = "org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
- public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
- public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
- public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
+ //public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
public static final String MIN_TOKEN_MATCH_FACTOR = "org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor";
// public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
+ //Search parameters
+ /**
+ * Used as fallback in case a {@link Token} does not have a {@link PosTag} or
+ * {@link NlpAnnotations#POS_ANNOTATION POS annotations} do have a low confidence.
+     * In such cases only words that are longer than this value will be considered for
+ * linking
+ */
+ public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
+ /**
+ * The maximum number of {@link Token} used as search terms with the
+ * {@link EntitySearcher#lookup(String, Set, java.util.List, String[], Integer)}
+ * method
+ */
+ public static final String MAX_SEARCH_TOKENS = "org.apache.stanbol.enhancer.engines.keywordextraction.masSearchTokens";
+ /**
+ * The maximum number of {@link Token} searched around a "processable" Token for
+ * additional search tokens.<p>
+     * As an example, in the text section "at the University of Munich a new procedure to"
+     * only "Munich" would be classified as {@link Pos#ProperNoun} and considered as
+     * "processable". However for searching it makes sense to use additional Tokens to
+     * reduce (or correctly rank) the expected high number of results for "Munich".
+     * Because of that "matchable" words surrounding the "processable" are considered as
+     * included for searches.<p>
+     * This parameter allows configuring the maximum distance surrounding the current
+     * "processable" Token within which other "processable" tokens can be included in searches.
+ */
+ public static final String MAX_SEARCH_TOKEN_DISTANCE = "org.apache.stanbol.enhancer.engines.keywordextraction.masSearchTokenDistance";
+
+ /**
+ * {@link NlpAnnotations#POS_ANNOTATION POS annotations} with a lower
+ * confidence than this value will be ignored.
+ */
+ public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
+ /**
+ * If enabled only {@link Pos#ProperNoun}, {@link Pos#Foreign} and {@link Pos#Acronym} are Matched. If
+ * deactivated all Tokens with the category {@link LexicalCategory#Noun} and
+ * {@link LexicalCategory#Residual} are considered for matching.<p>
+ * This property allows an easy configuration of the matching that is sufficient for most usage scenarios.
+ * Users that need to have more control can configure language specific mappings by using
+ * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and
+ * {@link #PARAM_POS_PROBABILITY} in combination with the {@link #PROCESSED_LANGUAGES}
+ * configuration.<p>
+     * The {@link #DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE default} of this property is <code>false</code>
+ */
+ public static final String PROCESS_ONLY_PROPER_NOUNS_STATE = "org.apache.stanbol.enhancer.engines.keywordextraction.properNounsState";
+ public static final boolean DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE = false;
+ public static Set<Pos> DEFAULT_PROCESSED_POS_TYPES = TextProcessingConfig.DEFAULT_PROCESSED_POS;
+ public static Set<LexicalCategory> DEFAULT_PROCESSED_LEXICAL_CATEGORIES = TextProcessingConfig.DEFAULT_PROCESSED_LEXICAL_CATEGORIES;
+ /**
+     * Allows configuring the processed languages by using the syntax supported by {@link LanguageConfiguration}.
+ * In addition this engine supports language specific configurations for matched {@link LexicalCategory}
+ * {@link Pos} and String POS tags as well as Pos annotation probabilities by using the parameters
+ * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and
+ * {@link #PARAM_POS_PROBABILITY}.<p>
+ * See the documentation of {@link LanguageConfiguration} for details of the Syntax.
+ */
+ public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
+ /*
+ * Parameters used for language specific text processing configurations
+ */
+ public static final String PARAM_LEXICAL_CATEGORIES = "lc";
+ public static final String PARAM_POS_TYPES = "pos";
+ public static final String PARAM_POS_TAG = "tag";
+ public static final String PARAM_POS_PROBABILITY = "prob";
/**
* Adds the dereference feature (STANBOL-333) also to this engine.
* This will be replaced by STANBOL-336.
@@ -178,6 +252,10 @@ public class KeywordLinkingEngine
*/
public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true;
/**
+ * Allows to add a list of fields that are included when dereferencing Entities
+ */
+ public static final String DEREFERENCE_ENTITIES_FIELDS = "org.apache.stanbol.enhancer.engines.keywordextraction.dereferenceFields";
+ /**
* Additional fields added for dereferenced entities
*/
private static final Collection<String> DEREFERENCE_FIELDS = Arrays.asList(
@@ -200,28 +278,40 @@ public class KeywordLinkingEngine
* The languages this engine is configured to enhance. An empty List is
* considered as active for any language
*/
- private Set<String> languages = DEFAULT_LANGUAGES;
+ private LanguageConfiguration languages = new LanguageConfiguration(PROCESSED_LANGUAGES, new String[]{"*"});
/**
* The literal representing the LangIDEngine as creator.
*/
public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
-
+
+ /**
+ * The default value for the LIMIT of the {@link EntitySearcher}
+ */
+ private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10;
+
private EntitySearcher entitySearcher;
private EntityLinkerConfig linkerConfig;
- private TextAnalyzerConfig nlpConfig;
- /**
- * The reference to the OpenNLP component
- */
- @org.apache.felix.scr.annotations.Reference
- private OpenNLP openNLP;
- //TextAnalyzer was changed to have a scope of a single request ( call to
- //#computeEnhancement!
- //private TextAnalyzer textAnalyser;
- /**
- * Used to create {@link AnalysedContent} instances for parsed content items
- */
- private OpenNlpAnalysedContentFactory analysedContentFactory;
+ private TextProcessingConfig defaultTextProcessingConfig;
+ private Map<String,TextProcessingConfig> textProcessingConfigs = new HashMap<String,TextProcessingConfig>();
+
+ //NOTE as I want to inject an instance of LabelTokenizerManager I need to implement my own
+ //bind/unbind methods as the generated methods would expect a field
+ // "LabelTokenizerManager labelTokenizer" and not "LabelTokenizer labelTokenizer"
+ @org.apache.felix.scr.annotations.Reference(referenceInterface=LabelTokenizerManager.class,
+ bind="bindLabelTokenizer",unbind="unbindLabelTokenizer")
+ private LabelTokenizer labelTokenizer;
+
+ protected void bindLabelTokenizer(LabelTokenizerManager ltm){
+ labelTokenizer = ltm;
+ }
+
+ protected void unbindLabelTokenizer(LabelTokenizerManager ltm){
+ labelTokenizer = null;
+ }
+
+
+
/**
* The literalFactory used to create typed literals
*/
@@ -282,44 +372,36 @@ public class KeywordLinkingEngine
public KeywordLinkingEngine() {
}
/**
- * Internal Constructor used by {@link #createInstance(OpenNLP, EntitySearcher, EntityLinkerConfig)}
- * @param openNLP
- * @param entitySearcher
- * @param config
- */
- protected KeywordLinkingEngine(OpenNLP openNLP,EntitySearcher entitySearcher,
- TextAnalyzerConfig nlpConfig,EntityLinkerConfig linkingConfig){
- this.openNLP = openNLP;
+ * Internal Constructor used by {@link #createInstance(EntitySearcher, TextProcessingConfig, EntityLinkerConfig)}
+ * @param entitySearcher The component used to lookup Entities
+ * @param textProcessingConfig The configuration on how to use the {@link AnalysedText} content part of
+ * processed {@link ContentItem}s
+ * @param linkingConfig the configuration for the EntityLinker
+ */
+ protected KeywordLinkingEngine(EntitySearcher entitySearcher,TextProcessingConfig textProcessingConfig,
+ EntityLinkerConfig linkingConfig, LabelTokenizer labelTokenizer){
this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig();
- this.nlpConfig = nlpConfig != null ? nlpConfig : new TextAnalyzerConfig();
- this.analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
+ this.defaultTextProcessingConfig = textProcessingConfig != null ? textProcessingConfig : new TextProcessingConfig();
+ this.textProcessingConfigs = Collections.emptyMap();
this.entitySearcher = entitySearcher;
+ this.labelTokenizer = labelTokenizer;
}
/**
* Allows to create an instance that can be used outside of an OSGI
* environment. This is mainly intended for unit tests.
- * @param openNLP The {@link OpenNLP} instance used for natural language processing
- * @param entitySearcher the searcher used to lookup terms
- * @param config the configuration or <code>null</code> to use the defaults
+ * @param entitySearcher The component used to lookup Entities
+ * @param textProcessingConfig The configuration on how to use the {@link AnalysedText} content part of
+ * processed {@link ContentItem}s
+ * @param linkingConfig the configuration for the EntityLinker
* @return the created engine instance
*/
- public static KeywordLinkingEngine createInstance(OpenNLP openNLP,
- EntitySearcher entitySearcher,
- TextAnalyzerConfig nlpConfig,
- EntityLinkerConfig linkingConfig){
- return new KeywordLinkingEngine(openNLP,entitySearcher,nlpConfig,linkingConfig);
+ public static KeywordLinkingEngine createInstance(EntitySearcher entitySearcher,
+ TextProcessingConfig textProcessingConfig,
+ EntityLinkerConfig linkingConfig,
+ LabelTokenizer labelTokenizer){
+ return new KeywordLinkingEngine(entitySearcher,textProcessingConfig,linkingConfig,labelTokenizer);
}
-
- /**
- * Checks if the parsed language is enabled for processing.
- * @param language The language to process
- * @return the processing state for the parsed language.
- */
- protected boolean isProcessableLanguages(String language) {
- return languages.isEmpty() || languages.contains(language);
- }
-
@Override
public Map<String,Object> getServiceProperties() {
return Collections.unmodifiableMap(Collections.singletonMap(
@@ -329,70 +411,57 @@ public class KeywordLinkingEngine
@Override
public int canEnhance(ContentItem ci) throws EngineException {
- if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
- return ENHANCE_ASYNC; //KeywordLinking now supports async processing
- } else {
+ log.info("canEnhancer {}",ci.getUri());
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ log.warn("{} '{}' is inactive because EntitySearcher does not support Offline mode!",
+ getClass().getSimpleName(),getName());
+ return CANNOT_ENHANCE;
+ }
+ String language = getLanguage(this, ci, false);
+ if(language == null || !languages.isLanguage(language)){
+ log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.",
+ new Object[]{ getName(), ci.getUri(), language});
return CANNOT_ENHANCE;
}
+ //we need a detected language, the AnalyzedText contentPart with
+ //Tokens.
+ AnalysedText at = getAnalysedText(this, ci, false);
+ return at != null && at.getTokens().hasNext() ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
}
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
+ log.info(" enhance ci {}",ci.getUri());
if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
- throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
+ throw new EngineException(this,ci,"Offline mode is not supported by the used EntitySearcher!",null);
+ }
+ AnalysedText at = getAnalysedText(this, ci, true);
+ log.info(" > AnalysedText {}",at);
+ String language = getLanguage(this, ci, true);
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements for ContentItem {} language {} text={}",
+ new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
}
- Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
- if(contentPart == null){
- throw new IllegalStateException("No ContentPart with a supported Mime Type"
- + "found for ContentItem "+ci.getUri()+"(supported: '"
- + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was"
- + "NOT called and indicates a bug in the used EnhancementJobManager!");
+ log.info(" > Language {}",language);
+ TextProcessingConfig tpc = textProcessingConfigs.get(language);
+ if(tpc == null){
+ tpc = defaultTextProcessingConfig;
+ log.info(" ... with default TextProcessingConfig");
+ } else {
+ log.info(" ... with language specific TextProcessingConfig");
}
- String text;
+ EntityLinker entityLinker = new EntityLinker(at,language,
+ defaultTextProcessingConfig, entitySearcher, linkerConfig, labelTokenizer);
+ //process
+ entityLinker.process();
+ //write results (requires a write lock)
+ ci.getLock().writeLock().lock();
try {
- text = ContentItemHelper.getText(contentPart.getValue());
- } catch (IOException e) {
- throw new InvalidContentException(String.format("Unable to extract "
- +" text from ContentPart %s of ContentItem %s!",
- contentPart.getKey(),ci.getUri()),e);
- }
- if (text.trim().length() == 0) {
- // TODO: make the length of the data a field of the ContentItem
- // interface to be able to filter out empty items in the canEnhance
- // method
- log.warn("ContentPart {} of ContentItem does not contain any Text to extract knowledge from",
- contentPart.getKey(), ci);
- return;
- }
- //Determine the language
- String language;
- ci.getLock().readLock().lock();
- try {
- language = extractLanguage(ci);
+ writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
} finally {
- ci.getLock().readLock().unlock();
+ ci.getLock().writeLock().unlock();
}
- if(isProcessableLanguages(language)){
- log.debug("computeEnhancements for ContentItem {} language {} text={}",
- new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)});
-
- EntityLinker entityLinker = new EntityLinker(
- analysedContentFactory.create(text, language),
- entitySearcher, linkerConfig);
- //process
- entityLinker.process();
- //write results (requires a write lock)
- ci.getLock().writeLock().lock();
- try {
- writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
- } finally {
- ci.getLock().writeLock().unlock();
- }
- } else {
- log.debug("ignore ContentItem {} because language '{}' is not configured to" +
- "be processed by this engine.",ci.getUri().getUnicodeString(),language);
- }
-
}
/**
@@ -474,38 +543,6 @@ public class KeywordLinkingEngine
}
}
}
- /**
- * Extracts the language of the parsed ContentItem by using
- * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and "en" as
- * default.
- * @param ci the content item
- * @return the language
- */
- private String extractLanguage(ContentItem ci) {
- String lang = EnhancementEngineHelper.getLanguage(ci);
-// if(lang != null){
-// MGraph metadata = ci.getMetadata();
-// Iterator<Triple> langaugeEnhancementCreatorTriples =
-// metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
-// if(langaugeEnhancementCreatorTriples.hasNext()){
-// String lang = EnhancementEngineHelper.getString(metadata,
-// langaugeEnhancementCreatorTriples.next().getSubject(),
-// Properties.DC_LANGUAGE);
- if(lang != null){
- return lang;
- } else {
- log.warn("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
- new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
- log.warn(" ... return 'en' as default");
- return "en";
- }
-// } else {
-// log.warn("Unable to extract language for ContentItem %s! Is the %s active?",
-// ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm());
-// log.warn(" ... return 'en' as default");
-// return "en";
-// }
- }
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -525,7 +562,7 @@ public class KeywordLinkingEngine
* call<ul>
* <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)}
* <li> {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)} and
- * <li> {@link #activateTextAnalyzerConfig(Dictionary)}
+ * <li> {@link #activateTextProcessingConfig(Dictionary)}
* <li> {@link #dereferenceEntitiesState} (needs to be called after
* {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)})
* </ul>
@@ -539,7 +576,7 @@ public class KeywordLinkingEngine
protected void activate(ComponentContext context) throws ConfigurationException {
super.activate(context);
Dictionary<String,Object> properties = context.getProperties();
- activateTextAnalyzerConfig(properties);
+ activateTextProcessingConfig(properties);
activateEntitySearcher(context, properties);
activateEntityLinkerConfig(properties);
activateEntityDereference(properties);
@@ -550,7 +587,7 @@ public class KeywordLinkingEngine
* {@link #DEREFERENCE_ENTITIES} configuration.
* @param properties the configuration
*/
- protected final void activateEntityDereference(Dictionary<String,Object> properties) {
+ protected final void activateEntityDereference(Dictionary<String,Object> properties) throws ConfigurationException {
Object value = properties.get(DEREFERENCE_ENTITIES);
if(value instanceof Boolean){
dereferenceEntitiesState = ((Boolean)value).booleanValue();
@@ -560,9 +597,32 @@ public class KeywordLinkingEngine
dereferenceEntitiesState = DEFAULT_DEREFERENCE_ENTITIES_STATE;
}
if(dereferenceEntitiesState){
- linkerConfig.getSelectedFields().addAll(DEREFERENCE_FIELDS);
- }
- }
+ value = properties.get(DEREFERENCE_ENTITIES_FIELDS);
+ if(value instanceof String[]){
+ for(String field : (String[])value){
+ if(field != null && !field.isEmpty()){
+ linkerConfig.getSelectedFields().add(field);
+ }
+ }
+ } else if(value instanceof Collection<?>){
+ for(Object field : (Collection<?>)value){
+ if(field != null && !field.toString().isEmpty()){
+ linkerConfig.getSelectedFields().add(field.toString());
+ }
+ }
+ } else if(value instanceof String){
+ if(!value.toString().isEmpty()){
+ linkerConfig.getSelectedFields().add(value.toString());
+ }
+ } else if(value != null){
+ throw new ConfigurationException(DEREFERENCE_ENTITIES_FIELDS,
+ "Dereference Entities_Fields MUST BE parsed as String[], Collection<String> or "
+ + "String (single value). The actual value '"+value+"'(type: '"+value.getClass()
+ + "') is NOT supported");
+ } else { //value == null -> add the default fields
+ linkerConfig.getSelectedFields().addAll(DEREFERENCE_FIELDS);
+ }
+ } }
/**
* Initialise the {@link TextAnalyzer} component.<p>
@@ -575,27 +635,10 @@ public class KeywordLinkingEngine
*
* @param configuration the OSGI component configuration
*/
- protected final void activateTextAnalyzerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
- nlpConfig = new TextAnalyzerConfig();
- Object value;
- value = configuration.get(PROCESSED_LANGUAGES);
- if(value == null){
- this.languages = DEFAULT_LANGUAGES;
- } else if (value.toString().trim().isEmpty()){
- this.languages = Collections.emptySet();
- } else {
- String[] languageArray = value.toString().split(",");
- languages = new HashSet<String>();
- for(String language : languageArray){
- if(language != null){
- language = language.trim();
- if(!language.isEmpty()){
- languages.add(language);
- }
- }
- }
- }
- value = configuration.get(MIN_POS_TAG_PROBABILITY);
+ protected final void activateTextProcessingConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
+ //Parse the default text processing configuration
+ defaultTextProcessingConfig = new TextProcessingConfig();
+ Object value = configuration.get(MIN_POS_TAG_PROBABILITY);
double minPosTagProb;
if(value instanceof Number){
minPosTagProb = ((Number)value).doubleValue();
@@ -614,20 +657,134 @@ public class KeywordLinkingEngine
"The configured min POS tag probability MUST BE in the range [0..1] " +
"or < 0 to deactivate this feature (parsed value "+value+")!");
}
- nlpConfig.setMinPosTagProbability(minPosTagProb);
- value = configuration.get(KEYWORD_TOKENIZER);
- //the keyword tokenizer config
+ defaultTextProcessingConfig.setMinPosAnnotationProbability(minPosTagProb);
+ defaultTextProcessingConfig.setMinExcludePosAnnotationProbability(minPosTagProb/2d);
+ //set the default LexicalTypes
+ value = configuration.get(PROCESS_ONLY_PROPER_NOUNS_STATE);
+ boolean properNounState;
if(value instanceof Boolean){
- nlpConfig.forceKeywordTokenizer((Boolean)value);
- } else if(value != null && !value.toString().isEmpty()){
- nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
+ properNounState = ((Boolean)value).booleanValue();
+ } else if (value != null){
+ properNounState = Boolean.parseBoolean(value.toString());
+ } else {
+ properNounState = DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
+ }
+ if(properNounState){
+ defaultTextProcessingConfig.setProcessedLexicalCategories(Collections.EMPTY_SET);
+ defaultTextProcessingConfig.setProcessedPos(DEFAULT_PROCESSED_POS_TYPES);
+ log.debug("> ProperNoun matching activated (matched Pos: {})",
+ defaultTextProcessingConfig.getProcessedPos());
+ } else {
+ defaultTextProcessingConfig.setProcessedLexicalCategories(DEFAULT_PROCESSED_LEXICAL_CATEGORIES);
+ defaultTextProcessingConfig.setProcessedPos(Collections.EMPTY_SET);
+ log.debug("> Noun matching activated (matched LexicalCategories: {})",
+ defaultTextProcessingConfig.getProcessedLexicalCategories());
+ }
+ //parse the language configuration
+ value = configuration.get(PROCESSED_LANGUAGES);
+ if(value instanceof String){
+ throw new ConfigurationException(PROCESSED_LANGUAGES, "Unable to configure "
+ + getClass().getSimpleName()+" '"+getName()+": 'Comma separated String "
+ + "is not supported for configuring the processed languages because "
+ + "the comma is used as separator for values of the parameters '"
+ + PARAM_LEXICAL_CATEGORIES+"', '"+ PARAM_POS_TYPES+"' and '"+PARAM_POS_TAG
+ + "! Users need to use String[] or Collection<?> instead!");
+ }
+ languages.setConfiguration(configuration);
+ Map<String,String> defaultConfig = languages.getDefaultParameters();
+ if(!defaultConfig.isEmpty()){
+ applyLanguageParameter(defaultTextProcessingConfig,null,defaultConfig);
+ }
+ for(String lang : languages.getExplicitlyIncluded()){
+ TextProcessingConfig tpc = defaultTextProcessingConfig.clone();
+ applyLanguageParameter(tpc, lang, languages.getParameters(lang));
+ this.textProcessingConfigs.put(lang, tpc);
+ }
+ }
+
+ private void applyLanguageParameter(TextProcessingConfig tpc, String language, Map<String,String> config) throws ConfigurationException {
+ Set<LexicalCategory> lexCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_LEXICAL_CATEGORIES, LexicalCategory.class);
+ Set<Pos> pos = parseEnumParam(config, PROCESSED_LANGUAGES, language,PARAM_POS_TYPES, Pos.class);
+ Set<String> tags = parsePosTags(config.get(PARAM_POS_TAG));
+ Double prob = null;
+ String paramVal = config.get(PARAM_POS_PROBABILITY);
+ if(paramVal != null && !paramVal.trim().isEmpty()){
+ try {
+ prob = Double.parseDouble(paramVal.trim());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(PROCESSED_LANGUAGES, "Unable to parse parameter '"
+ + PARAM_POS_PROBABILITY+"="+paramVal.trim()
+ + "' from the "+(language == null ? "default" : language)
+ + " language configuration", e);
+ }
+ }
+ if(!lexCats.isEmpty() || !pos.isEmpty() || !tags.isEmpty()){
+ log.info(" > use specific language Configuration for language {}",
+ getClass().getSimpleName(),getName());
+ log.info(" - LexCat: {}",lexCats);
+ log.info(" - Pos: {}",pos);
+ log.info(" - Tags: {}",tags);
+ tpc.setProcessedLexicalCategories(lexCats);
+ tpc.setProcessedPos(pos);
+ tpc.setProcessedPosTags(tags);
+ }
+ if(prob != null){
+ tpc.setMinPosAnnotationProbability(prob);
+ tpc.setMinExcludePosAnnotationProbability(prob/2d);
+ }
+ }
+ private Set<String> parsePosTags(String value) {
+ if(value == null || value.isEmpty()){
+ return Collections.EMPTY_SET;
+ } else {
+ Set<String> tags = new HashSet<String>();
+ for(String entry : value.split(",")){
+ entry = entry.trim();
+ if(!entry.isEmpty()){
+ tags.add(entry);
+ }
+ }
+ return tags;
}
- //nlpConfig.enablePosTypeChunker(false);
- //nlpConfig.enableChunker(false);
- analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
}
/**
+ * Utility to parse Enum members out of a comma separated string
+ * @param config the config
+ * @param property the property (only used for error handling)
+ * @param param the key of the config used to obtain the config
+ * @param enumClass the {@link Enum} class
+ * @return the configured members of the Enum or an empty set if none
+ * @throws ConfigurationException if a configured value was not part of the enum
+ */
+ private <T extends Enum<T>> Set<T> parseEnumParam(Map<String,String> config,
+ String property, String language, //params used for logging
+ String param,Class<T> enumClass) throws ConfigurationException {
+ Set<T> enumSet;
+ String val = config.get(param);
+ if(val == null){
+ enumSet = Collections.emptySet();
+ } else {
+ enumSet = EnumSet.noneOf(enumClass);
+ for(String entry : val.split(",")){
+ entry = entry.trim();
+ if(!entry.isEmpty()){
+ try {
+ enumSet.add(Enum.valueOf(enumClass,entry.toString()));
+ } catch (IllegalArgumentException e) {
+ throw new ConfigurationException(property,
+ "'"+entry +"' of param '"+param+"' for language '"
+ + (language == null ? "default" : language)
+ + "'is not a member of the enum "+ enumClass.getSimpleName()
+ + "(configured : '"+val+"')!" ,e);
+ }
+ }
+ }
+ }
+ return enumSet;
+ }
+
+ /**
* Configures the parsed {@link EntityLinkerConfig} with the values of the
* following properties:<ul>
* <li>{@link #NAME_FIELD}
@@ -743,6 +900,48 @@ public class KeywordLinkingEngine
}
linkerConfig.setMinSearchTokenLength(minSearchTokenLength);
}
+ //init MAX_SEARCH_TOKENS
+ value = configuration.get(MAX_SEARCH_TOKENS);
+ Integer maxSearchTokens;
+ if(value instanceof Integer){
+ maxSearchTokens = (Integer)value;
+ } else if (value != null){
+ try {
+ maxSearchTokens = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MAX_SEARCH_TOKENS, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ maxSearchTokens = null;
+ }
+ if(maxSearchTokens != null){
+ if(maxSearchTokens < 1){
+ throw new ConfigurationException(MAX_SEARCH_TOKENS, "Values MUST be valid Integer values > 0");
+ }
+ linkerConfig.setMaxSearchTokens(maxSearchTokens);
+ }
+
+ //init the MAX_SEARCH_TOKEN_DISTANCE
+ value = configuration.get(MAX_SEARCH_TOKEN_DISTANCE);
+ Integer maxSearchDistance;
+ if(value instanceof Integer){
+ maxSearchDistance = (Integer)value;
+ } else if (value != null){
+ try {
+ maxSearchDistance = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MAX_SEARCH_TOKENS, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ maxSearchDistance = null;
+ }
+ if(maxSearchDistance != null){
+ if(maxSearchDistance < 1){
+ throw new ConfigurationException(MAX_SEARCH_TOKENS, "Values MUST be valid Integer values > 0");
+ }
+ linkerConfig.setMaxSearchDistance(maxSearchDistance);
+ }
+
//init the REDIRECT_PROCESSING_MODE
value = configuration.get(REDIRECT_PROCESSING_MODE);
if(value != null){
@@ -753,6 +952,7 @@ public class KeywordLinkingEngine
Arrays.toString(RedirectProcessingMode.values()));
}
}
+
//init the DEFAULT_LANGUAGE
value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
if(value != null){
@@ -766,6 +966,7 @@ public class KeywordLinkingEngine
linkerConfig.setDefaultLanguage(defaultLang);
}
}
+
// init MIN_TOKEN_MATCH_FACTOR
value=configuration.get(MIN_TOKEN_MATCH_FACTOR);
float minTokenMatchFactor;
@@ -873,9 +1074,9 @@ public class KeywordLinkingEngine
}
//TODO: make limit configurable!
if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
- entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
+ entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT);
} else {
- entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
+ entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT);
}
}
/**
@@ -891,7 +1092,7 @@ public class KeywordLinkingEngine
protected void deactivate(ComponentContext context) {
super.deactivate(context);
deactivateEntitySearcher();
- deactivateTextAnalyzerConfig();
+ deactivateTextProcessingConfig();
deactivateEntityLinkerConfig();
deactivateEntityDereference();
}
@@ -907,10 +1108,9 @@ public class KeywordLinkingEngine
* Deactivates the {@link TextAnalyzer} as well as resets the set of languages
* to process to {@link #DEFAULT_LANGUAGES}
*/
- protected void deactivateTextAnalyzerConfig() {
- this.nlpConfig = null;
- this.analysedContentFactory = null;
- languages = DEFAULT_LANGUAGES;
+ protected void deactivateTextProcessingConfig() {
+ this.languages.setDefault(); //reset to the default
+ this.textProcessingConfigs.clear();
}
/**
Copied: stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/EntityLinker.java (from r1393931, stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java)
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/EntityLinker.java?p2=stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/EntityLinker.java&p1=stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java&r1=1393931&r2=1403238&rev=1403238&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/EntityLinker.java Mon Oct 29 10:24:38 2012
@@ -14,7 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+package org.apache.stanbol.enhancer.engines.keywordextraction.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
import java.util.Arrays;
@@ -28,10 +30,16 @@ import java.util.Map;
import java.util.Set;
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
-import org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState.TokenData;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion.MATCH;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
-import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion.MATCH;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LabelTokenizer;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.TextProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
@@ -44,9 +52,10 @@ import org.slf4j.LoggerFactory;
public class EntityLinker {
private final Logger log = LoggerFactory.getLogger(EntityLinker.class);
-
- private final EntityLinkerConfig config;
- private final AnalysedContent content;
+
+ private final EntityLinkerConfig linkerConfig;
+ private final TextProcessingConfig textProcessingConfig;
+ private final AnalysedText analysedText;
private final EntitySearcher entitySearcher;
/**
* The state of the current processing
@@ -57,142 +66,184 @@ public class EntityLinker {
*/
private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
- /**
- * After {@link #process()}ing this returns the entities linked for the
- * parsed {@link AnalysedContent}.
- * @return the linked entities
- */
- public final Map<String,LinkedEntity> getLinkedEntities() {
- return linkedEntities;
- }
- public EntityLinker(AnalysedContent content,EntitySearcher taxonomy,EntityLinkerConfig config){
- if(config == null){
- throw new IllegalArgumentException("The parsed TaxonomyLinkerConfig MUST NOT be NULL!");
- }
- if(taxonomy == null){
- throw new IllegalArgumentException("The parsed Taxonomy MUST NOT be NULL!");
- }
- if(content == null){
- throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT be NULL!");
- }
- this.content = content;
- this.entitySearcher = taxonomy;
- this.config = config;
- this.state = new ProcessingState(content.getAnalysedText());
+ private Integer lookupLimit;
+
+ private LabelTokenizer labelTokenizer;
+
+
+ public EntityLinker(AnalysedText analysedText, String language,
+ TextProcessingConfig textProcessingConfig,
+ EntitySearcher entitySearcher,
+ EntityLinkerConfig linkerConfig,
+ LabelTokenizer labelTokenizer) {
+ this.analysedText = analysedText;
+ this.entitySearcher = entitySearcher;
+ this.linkerConfig = linkerConfig;
+ this.textProcessingConfig = textProcessingConfig;
+ this.labelTokenizer = labelTokenizer;
+ this.state = new ProcessingState(analysedText,language,textProcessingConfig,linkerConfig);
+ this.lookupLimit = Math.max(10,linkerConfig.getMaxSuggestions()*2);
}
/**
* Steps over the sentences, chunks, tokens of the {@link #sentences}
*/
public void process() throws EngineException {
- int debugedIndex = 0;
+ //int debugedIndex = 0;
while(state.next()) {
- if(log.isDebugEnabled() && (state.getTokenIndex() > debugedIndex || state.getTokenIndex() == 0)){
- debugedIndex = state.getTokenIndex();
- Token token = state.getToken();
- log.debug(" {} {} (pos:{}|prop:{})",new Object[]{
- isProcessableToken(token)? '+':'-',
- token.getText(),token.getPosTags(),token.getPosProbabilities()
- });
- }
- if(isProcessableToken(state.getToken())){
- List<String> searchStrings = new ArrayList<String>(config.getMaxSearchTokens());
- searchStrings.add(state.getToken().getText());
- //get the list of all tokens that can possible be matched
- int includeTokenIndex = state.getTokenIndex();
- includeTokenIndex++;
- while(searchStrings.size() < config.getMaxSearchTokens() && //more search strings
- (includeTokenIndex <= (state.getChunk() != null ? //still within
- state.getChunk().getEnd() : //the chunk
- state.getSentence().getTokens().size()-1))){ //or sentence
- Token included = state.getSentence().getTokens().get(includeTokenIndex);
- if(log.isDebugEnabled() && includeTokenIndex > debugedIndex){
- debugedIndex = includeTokenIndex;
- log.debug(" {} {} (pos:{}|prop:{})",new Object[]{
- isProcessableToken(included)? '+':'-',
- included.getText(),included.getPosTags(),included.getPosProbabilities()
+ ProcessingState.TokenData token = state.getToken();
+ if(log.isDebugEnabled()){
+ log.debug("--- process Token {}: {} (pos:{}) chunk: {}",
+ new Object[]{token.index,token.token,
+ token.token.getAnnotations(POS_ANNOTATION),
+ token.inChunk != null ?
+ (token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) :
+ "none"});
+ }
+ List<String> searchStrings = new ArrayList<String>(linkerConfig.getMaxSearchTokens());
+ searchStrings.add(token.token.getSpan());
+ //Determine the range we are allowed to search for tokens
+ final int minIncludeIndex;
+ int maxIndcludeIndex;
+ if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks()){
+ minIncludeIndex = Math.max(
+ state.getConsumedIndex()+1,
+ token.inChunk.startToken);
+ maxIndcludeIndex = token.inChunk.endToken;
+ } else {
+ maxIndcludeIndex = state.getTokens().size() - 1;
+ minIncludeIndex = state.getConsumedIndex() + 1;
+ }
+ int prevIndex,pastIndex; //search away from the currently active token
+ int distance = 0;
+ do {
+ distance++;
+ prevIndex = token.index-distance;
+ pastIndex = token.index+distance;
+ if(minIncludeIndex <= prevIndex){
+ TokenData prevToken = state.getTokens().get(prevIndex);
+ if(log.isDebugEnabled()){
+ log.debug(" {} {}:'{}' (pos:{})",new Object[]{
+ prevToken.isMatchable? '+':'-',prevToken.index,
+ prevToken.token.getSpan(),
+ prevToken.token.getAnnotations(POS_ANNOTATION)
});
}
- includeTokenIndex++;
- if(isProcessableToken(included)){
- searchStrings.add(included.getText());
- }
- }
- //search for Entities
- List<Suggestion> suggestions = lookupEntities(searchStrings);
- if(!suggestions.isEmpty()){
- //update the suggestions based on the best match
- int bestMatchCount = suggestions.get(0).getMatchCount();
- Iterator<Suggestion> it = suggestions.iterator();
- while(it.hasNext()){
- Suggestion suggestion = it.next();
- //suggestions that match less tokens as the best match
- //need to be updated to PARTIAL
- if(suggestion.getMatchCount() < bestMatchCount){
- suggestion.setMatch(MATCH.PARTIAL);
- }
- //Filter matches with less than config.getMinFoundTokens()
- //if matchcount is less than of the best match
- if(suggestion.getMatchCount() < bestMatchCount &&
- suggestion.getMatchCount() < config.getMinFoundTokens()){
- it.remove();
- } else { //calculate the score
- double suggestionMatchScore = suggestion.getMatchCount()*suggestion.getMatchScore();
- //how good is the current match in relation to the best one
- double spanScore = suggestion.getMatchCount()/bestMatchCount;
- //how good is the match to the span selected by this suggestion
- double textScore = suggestionMatchScore/suggestion.getSpan();
- //how good is the match in relation to the tokens of the suggested label
- double labelScore = suggestionMatchScore/suggestion.getLabelTokenCount();
- suggestion.setScore(spanScore*spanScore*textScore*labelScore);
- }
+ if(prevToken.isMatchable){
+ searchStrings.add(0,prevToken.token.getSpan());
}
- Suggestion oldBestRanked = suggestions.get(0); //for debugging
- //resort by score
- Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
- //this should never happen ... but the
- //matchcount of the best match MUST NOT change
- //after the sort by score!
- if(bestMatchCount != suggestions.get(0).getMatchCount()){
- log.warn("The match count for the top Ranked Suggestion for {} " +
- "changed after resorting based on Scores!",
- state.getTokenText(suggestions.get(0).getStart(),bestMatchCount));
- log.warn(" originalbest : {}",oldBestRanked);
- log.warn(" currnet ranking : {}",suggestions);
- log.warn(" ... this will result in worng confidence values relative to the best match");
- }
- //remove all suggestions > config.maxSuggestions
- if(suggestions.size() > config.getMaxSuggestions()){
- suggestions.subList(config.getMaxSuggestions(),suggestions.size()).clear();
- }
-
- //process redirects
- if(config.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
- for(Suggestion suggestion : suggestions){
- processRedirects(suggestion);
- }
+ }
+ if(maxIndcludeIndex >= pastIndex){
+ TokenData pastToken = state.getTokens().get(pastIndex);
+ if(log.isDebugEnabled()){
+ log.debug(" {} {}:'{}' (pos:{})",new Object[]{
+ pastToken.isMatchable? '+':'-',pastToken.index,
+ pastToken.token.getSpan(),
+ pastToken.token.getAnnotations(POS_ANNOTATION)
+ });
+ }
+ if(pastToken.isMatchable){
+ searchStrings.add(pastToken.token.getSpan());
}
- int start = suggestions.get(0).getStart();
- int span = suggestions.get(0).getSpan();
- //Store the linking results
- String selectedText = state.getTokenText(start,span);
- //float score;
- LinkedEntity linkedEntity = linkedEntities.get(selectedText);
- if(linkedEntity == null){
- linkedEntity = new LinkedEntity(selectedText,
- suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
- linkedEntities.put(selectedText, linkedEntity);
- }
- linkedEntity.addOccurrence(
- state.getSentence(), start, span);
- //set the next token to process to the next word after the
- //currently found suggestion
- state.setConsumed(start+span-1);
}
-
- } //else do not process this token
+ } while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
+ linkerConfig.getMaxSearchDistance() &&
+ (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex));
+ //we might have an additional element in the list
+ if(searchStrings.size() > linkerConfig.getMaxSearchTokens()){
+ searchStrings = searchStrings.subList(0, linkerConfig.getMaxSearchTokens());
+ }
+ log.debug(" >> searchStrings {}",searchStrings);
+ //search for Entities
+ List<Suggestion> suggestions = lookupEntities(searchStrings);
+ if(!suggestions.isEmpty()){
+ //update the suggestions based on the best match
+ int bestMatchCount = suggestions.get(0).getMatchCount();
+ Iterator<Suggestion> it = suggestions.iterator();
+ while(it.hasNext()){
+ Suggestion suggestion = it.next();
+ //suggestions that match less tokens as the best match
+ //need to be updated to PARTIAL
+ if(suggestion.getMatchCount() < bestMatchCount){
+ suggestion.setMatch(MATCH.PARTIAL);
+ }
+ //Filter matches with less than config.getMinFoundTokens()
+ //if matchcount is less than of the best match
+ if(suggestion.getMatchCount() < bestMatchCount &&
+ suggestion.getMatchCount() < linkerConfig.getMinFoundTokens()){
+ it.remove();
+ } else { //calculate the score
+ double suggestionMatchScore = suggestion.getMatchCount()*suggestion.getMatchScore();
+ //how good is the current match in relation to the best one
+ double spanScore = suggestion.getMatchCount()/bestMatchCount;
+ //how good is the match to the span selected by this suggestion
+ double textScore = suggestionMatchScore/suggestion.getSpan();
+ //how good is the match in relation to the tokens of the suggested label
+ double labelScore = suggestionMatchScore/suggestion.getLabelTokenCount();
+ suggestion.setScore(spanScore*spanScore*textScore*labelScore);
+ }
+ }
+ Suggestion oldBestRanked = suggestions.get(0); //for debugging
+ //resort by score
+ Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ //this should never happen ... but the
+ //matchcount of the best match MUST NOT change
+ //after the sort by score!
+ if(bestMatchCount != suggestions.get(0).getMatchCount()){
+ log.warn("The match count for the top Ranked Suggestion for {} " +
+ "changed after resorting based on Scores!",
+ state.getTokenText(suggestions.get(0).getStart(),bestMatchCount));
+ log.warn(" originalbest : {}",oldBestRanked);
+ log.warn(" current ranking : {}",suggestions);
+ log.warn(" ... this will result in wrong confidence values relative to the best match");
+ }
+ //remove all suggestions > config.maxSuggestions
+ if(suggestions.size() > linkerConfig.getMaxSuggestions()){
+ suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
+ }
+ if(log.isDebugEnabled()){
+ log.debug(" >> Suggestions:");
+ int i=0;
+ for(Suggestion s : suggestions){
+ log.debug(" - {}: {}",i,s);
+ i++;
+ }
+ }
+ //process redirects
+ if(linkerConfig.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
+ for(Suggestion suggestion : suggestions){
+ processRedirects(suggestion);
+ }
+ }
+ int start = suggestions.get(0).getStart();
+ int span = suggestions.get(0).getSpan();
+ //Store the linking results
+ String selectedText = state.getTokenText(start,span);
+ //float score;
+ LinkedEntity linkedEntity = linkedEntities.get(selectedText);
+ if(linkedEntity == null){
+ linkedEntity = new LinkedEntity(selectedText,
+ suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ linkedEntities.put(selectedText, linkedEntity);
+ }
+ linkedEntity.addOccurrence(state.getSentence(),
+ //NOTE: The end Token is "start+span-1"
+ state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
+ //set the next token to process to the next word after the
+ //currently found suggestion
+ state.setConsumed(start+span-1);
+ }
+
}
}
/**
+ * After {@link #process()}ing this returns the entities linked for the
+ * parsed {@link AnalysedContent}.
+ * @return the linked entities
+ */
+ public final Map<String,LinkedEntity> getLinkedEntities() {
+ return linkedEntities;
+ }
+ /**
* Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed
* {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
* values for the {@link LinkedEntity#getTypes()} by using the configured
@@ -206,10 +257,10 @@ public class EntityLinker {
Collection<String> conceptTypes = new HashSet<String>();
for(Suggestion suggestion : suggestions){
for(Iterator<Reference> types =
- suggestion.getRepresentation().getReferences(config.getTypeField());
+ suggestion.getRepresentation().getReferences(linkerConfig.getTypeField());
types.hasNext();conceptTypes.add(types.next().getReference()));
}
- Map<String,UriRef> typeMappings = config.getTypeMappings();
+ Map<String,UriRef> typeMappings = linkerConfig.getTypeMappings();
Set<UriRef> dcTypes = new HashSet<UriRef>();
for(String conceptType : conceptTypes){
UriRef dcType = typeMappings.get(conceptType);
@@ -217,8 +268,8 @@ public class EntityLinker {
dcTypes.add(dcType);
}
}
- if(dcTypes.isEmpty() && config.getDefaultDcType() != null){
- dcTypes.add(config.getDefaultDcType());
+ if(dcTypes.isEmpty() && linkerConfig.getDefaultDcType() != null){
+ dcTypes.add(linkerConfig.getDefaultDcType());
}
return dcTypes;
}
@@ -231,7 +282,7 @@ public class EntityLinker {
*/
private void processRedirects(Suggestion suggestion) {
//if mode is IGNORE -> nothing to do
- if(config.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
+ if(linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
return;
}
//in case results for queries are locally cached it might be the case
@@ -241,14 +292,14 @@ public class EntityLinker {
return; //Redirects for ResultMatch are already processed ... ignore
}
Representation result = suggestion.getResult();
- Iterator<Reference> redirects = result.getReferences(config.getRedirectField());
- switch (config.getRedirectProcessingMode()) {
+ Iterator<Reference> redirects = result.getReferences(linkerConfig.getRedirectField());
+ switch (linkerConfig.getRedirectProcessingMode()) {
case ADD_VALUES:
while(redirects.hasNext()){
Reference redirect = redirects.next();
if(redirect != null){
Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
- config.getSelectedFields());
+ linkerConfig.getSelectedFields());
if(redirectedEntity != null){
for(Iterator<String> fields = redirectedEntity.getFieldNames();fields.hasNext();){
String field = fields.next();
@@ -264,7 +315,7 @@ public class EntityLinker {
Reference redirect = redirects.next();
if(redirect != null){
Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
- config.getSelectedFields());
+ linkerConfig.getSelectedFields());
if(redirectedEntity != null){
//copy the original result score
redirectedEntity.set(RdfResourceEnum.resultScore.getUri(),
@@ -289,14 +340,20 @@ public class EntityLinker {
private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
Collection<? extends Representation> results;
try {
- results = entitySearcher.lookup(config.getNameField(),config.getSelectedFields(),
- searchStrings, state.getSentence().getLanguage(),config.getDefaultLanguage());
+ results = entitySearcher.lookup(linkerConfig.getNameField(),
+ linkerConfig.getSelectedFields(),
+ searchStrings,
+ new String[]{state.getLanguage(),linkerConfig.getDefaultLanguage()},
+ lookupLimit);
} catch (RuntimeException e) {
throw new EngineException(e.getMessage(),e);
}
+ log.debug(" - found {} entities ...",results.size());
List<Suggestion> suggestions = new ArrayList<Suggestion>();
for(Representation result : results){
+ log.debug(" > {}",result.getId());
Suggestion match = matchLabels(result);
+ log.debug(" < {}",match);
if(match.getMatch() != MATCH.NONE){
suggestions.add(match);
}
@@ -342,11 +399,11 @@ public class EntityLinker {
*/
private Suggestion matchLabels(Representation rep) {
String curLang = state.getLanguage(); //language of the current sentence
- String defLang = config.getDefaultLanguage(); //configured default language
+ String defLang = linkerConfig.getDefaultLanguage(); //configured default language
// Iterator<Text> labels = rep.get(config.getNameField(), //get all labels
// state.getLanguage(), //in the current language
// config.getDefaultLanguage()); //and the default language
- Iterator<Text> labels = rep.getText(config.getNameField());
+ Iterator<Text> labels = rep.getText(linkerConfig.getNameField());
Suggestion match = new Suggestion(rep);
Collection<Text> defaultLabels = new ArrayList<Text>();
boolean matchedCurLangLabel = false;
@@ -378,18 +435,21 @@ public class EntityLinker {
*/
private void matchLabel(Suggestion match, Text label) {
String text = label.getText();
- if(!config.isCaseSensitiveMatching()){
+ if(!linkerConfig.isCaseSensitiveMatching()){
text = text.toLowerCase(); //TODO use language of label for Locale
}
//Tokenize the label and remove remove tokens without alpha numerical chars
- String[] unprocessedLabelTokens = content.tokenize(text);
+ String[] unprocessedLabelTokens = labelTokenizer.tokenize(text,
+ state.getLanguage()); //TODO: maybe check of Pos.Foreign
+ if(unprocessedLabelTokens == null){ //no tokenizer available
+ log.info("Unable to tokenize {} language texts. Will process untokenized label {}",
+ state.getLanguage(),text);
+ unprocessedLabelTokens = new String[]{text}; //there is already a warning
+ }
int offset = 0;
for(int i=0;i<unprocessedLabelTokens.length;i++){
- boolean hasAlpha = false;
- for(int j=0;!hasAlpha && j<unprocessedLabelTokens[i].length();j++){
- hasAlpha = Character.isLetterOrDigit(unprocessedLabelTokens[i].charAt(j));
- }
- if(!hasAlpha){
+ boolean hasAlphaNumericChar = Utils.hasAlphaNumericChar(unprocessedLabelTokens[i]);
+ if(!hasAlphaNumericChar){
offset++;
} else if(offset > 0){
unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i];
@@ -410,26 +470,27 @@ public class EntityLinker {
//ensure the correct order of the tokens in the suggested entity
boolean search = true;
int firstFoundIndex = -1;
+ int firstProcessableFoundIndex = -1;
int lastFoundIndex = -1;
+ int lastProcessableFoundIndex = -1;
int firstFoundLabelIndex = -1;
int lastfoundLabelIndex = -1;
- Token currentToken;
+ TokenData currentToken;
String currentTokenText;
int currentTokenLength;
int notFound = 0;
- float minTokenMatchFactor = config.getMinTokenMatchFactor();
+ float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
//search for matches within the correct order
- for(int currentIndex = state.getTokenIndex();
- currentIndex < state.getSentence().getTokens().size()
+ for(int currentIndex = state.getToken().index;
+ currentIndex < state.getTokens().size()
&& search ;currentIndex++){
- currentToken = state.getSentence().getTokens().get(currentIndex);
- if(currentToken.hasAplhaNumericChar()){
- currentTokenText = currentToken.getText();
- if(!config.isCaseSensitiveMatching()){
+ currentToken = state.getTokens().get(currentIndex);
+ if(currentToken.hasAlphaNumeric){
+ currentTokenText = currentToken.token.getSpan();
+ if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
currentTokenLength = currentTokenText.length();
- boolean isProcessable = isProcessableToken(currentToken);
boolean found = false;
float matchFactor = 0f;
//iteration starts at the next token after the last matched one
@@ -460,8 +521,12 @@ public class EntityLinker {
}
//int found = text.indexOf(currentToken.getText().toLowerCase());
if(found){ //found
- if(isProcessable){
+ if(currentToken.isMatchable){
foundProcessableTokens++; //only count processable Tokens
+ if(firstProcessableFoundIndex < 0){
+ firstProcessableFoundIndex = currentIndex;
+ }
+ lastProcessableFoundIndex = currentIndex;
}
foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
@@ -472,7 +537,7 @@ public class EntityLinker {
lastFoundIndex = currentIndex;
} else { //not found
notFound++;
- if(isProcessable || notFound > config.getMaxNotFound()){
+ if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
//stop as soon as a token that needs to be processed is
//not found in the label or the maximum number of tokens
//that are not processable are not found
@@ -483,17 +548,16 @@ public class EntityLinker {
}
//search backwards for label tokens until firstFoundLabelIndex if there
//are unconsumed Tokens in the sentence before state.getTokenIndex
- int currentIndex = state.getTokenIndex()-1;
+ int currentIndex = state.getToken().index-1;
int labelIndex = firstFoundLabelIndex-1;
notFound = 0;
search = true;
while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
String labelTokenText = labelTokens[labelIndex];
if(labelTokenSet.remove(labelTokenText)){ //still not matched
- currentToken = state.getSentence().getTokens().get(currentIndex);
- boolean isProcessable = isProcessableToken(currentToken);
- currentTokenText = currentToken.getText();
- if(!config.isCaseSensitiveMatching()){
+ currentToken = state.getTokens().get(currentIndex);
+ currentTokenText = currentToken.token.getSpan();
+ if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
currentTokenLength = currentTokenText.length();
@@ -510,8 +574,9 @@ public class EntityLinker {
}
}
if(found){ //found
- if(isProcessable){
+ if(currentToken.isMatchable){
foundProcessableTokens++; //only count processable Tokens
+ firstProcessableFoundIndex = currentIndex;
}
foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
@@ -519,7 +584,7 @@ public class EntityLinker {
currentIndex --;
} else {
notFound++;
- if(isProcessable || notFound > config.getMaxNotFound()){
+ if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
//stop as soon as a token that needs to be processed is
//not found in the label or the maximum number of tokens
//that are not processable are not found
@@ -533,6 +598,7 @@ public class EntityLinker {
//e.g. if given and family name of persons are switched
MATCH labelMatch;
int coveredTokens = lastFoundIndex-firstFoundIndex+1;
+ int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
float labelMatchScore = (foundTokenMatch/(float)labelTokens.length);
//Matching rules
// - if less than config#minTokenFound() than accept only EXACT
@@ -541,19 +607,20 @@ public class EntityLinker {
// match (this will be very rare
if(foundProcessableTokens > 0 && match.getMatchCount() <= foundProcessableTokens) {
String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
- if(config.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
+ if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
labelMatch = MATCH.EXACT;
//set found to covered: May be lower because only
//processable tokens are counted, but Exact also checks
//of non-processable!
foundTokens = coveredTokens;
- } else if((foundProcessableTokens >= config.getMinFoundTokens() ||
+ foundProcessableTokens = coveredProcessableTokens;
+ } else if((foundProcessableTokens >= linkerConfig.getMinFoundTokens() ||
//NOTE (rwesten, 2012-05-21): Do not check if all covered
// Tokens are found, but if all Tokens of the Label are
// matched! (STANBOL-622)
//foundTokens == coveredTokens) &&
- foundTokens >= labelTokens.length) &&
- labelMatchScore >= 0.6f){
+ foundTokens >= labelTokens.length)){ //&&
+ //labelMatchScore >= 0.6f){
//same as above
//if(foundTokens == coveredTokens){
if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
@@ -568,7 +635,9 @@ public class EntityLinker {
if(match.getMatchCount() < foundProcessableTokens ||
match.getMatchCount() == foundProcessableTokens &&
labelMatch.ordinal() > match.getMatch().ordinal()){
- match.updateMatch(labelMatch, firstFoundIndex, coveredTokens, foundTokens,
+// match.updateMatch(labelMatch, firstFoundIndex, coveredTokens, foundTokens,
+// foundTokenMatch/foundTokens,label,labelTokens.length);
+ match.updateMatch(labelMatch, firstProcessableFoundIndex, coveredProcessableTokens, foundProcessableTokens,
foundTokenMatch/foundTokens,label,labelTokens.length);
} //else this match is not better as the existing one
} //else ignore labels with MATCH.NONE
@@ -614,27 +683,5 @@ public class EntityLinker {
}
return f > b ? f : b;
}
-
- /**
- * Checks if the current token of {@link #state} is processable.
- * @param token the {@link Token} to check.
- * @return <code>true</code> if the parsed token needs to be processed.
- * Otherwise <code>false</code>
- */
- private boolean isProcessableToken(Token token) {
- Boolean processToken = null;
- String[] posTags = token.getPosTags();
- double[] posProb = token.getPosProbabilities();
- if(posTags != null){
- int i=0;
- do {
- processToken = content.processPOS(posTags[i],posProb[i]);
- i++;
- } while(processToken == null && i<posTags.length);
- }
- if(processToken == null) {
- processToken = token.getText().length() >= config.getMinSearchTokenLength();
- }
- return processToken;
- }
+
}
Propchange: stanbol/branches/stanbol-nlp-processing/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/EntityLinker.java
------------------------------------------------------------------------------
svn:mime-type = text/plain