You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/07/11 18:59:09 UTC

svn commit: r1360296 - in /incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main: java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/ java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/ java/org/apache/stanb...

Author: rwesten
Date: Wed Jul 11 16:59:08 2012
New Revision: 1360296

URL: http://svn.apache.org/viewvc?rev=1360296&view=rev
Log:
fixes for STANBOL-685 and STANBOL-686 as stated in the issue description.

Also added debug level loggings about metadata of processed Tokens

Modified:
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1360296&r1=1360295&r2=1360296&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Wed Jul 11 16:59:08 2012
@@ -117,6 +117,8 @@ import org.slf4j.LoggerFactory;
         },value="IGNORE"),
     @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
         intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
+    @Property(name=KeywordLinkingEngine.MIN_TOKEN_MATCH_FACTOR,floatValue=
+            EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR),
     @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
     @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
         intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
@@ -164,6 +166,7 @@ public class KeywordLinkingEngine 
     public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
     public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
     public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
+    public static final String MIN_TOKEN_MATCH_FACTOR = "org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor";
 //  public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
     /**
      * Adds the dereference feature (STANBOL-333) also to this engine.
@@ -192,7 +195,7 @@ public class KeywordLinkingEngine 
      * language are processed. 
      */
     public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
-    public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.8;
+    public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667;
     /**
      * The languages this engine is configured to enhance. An empty List is
      * considered as active for any language
@@ -611,6 +614,7 @@ public class KeywordLinkingEngine 
                 "The configured min POS tag probability MUST BE in the range [0..1] " +
                 "or < 0 to deactivate this feature (parsed value "+value+")!");
         }
+        nlpConfig.setMinPosTagProbability(minPosTagProb);
         value = configuration.get(KEYWORD_TOKENIZER);
         //the keyword tokenizer config
         if(value instanceof Boolean){
@@ -618,7 +622,8 @@ public class KeywordLinkingEngine 
         } else if(value != null && !value.toString().isEmpty()){
             nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
         }
-        nlpConfig.setMinPosTagProbability(minPosTagProb);
+        //nlpConfig.enablePosTypeChunker(false);
+        //nlpConfig.enableChunker(false);
         analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
     }
 
@@ -632,6 +637,7 @@ public class KeywordLinkingEngine 
      * <li>{@link #MAX_SUGGESTIONS}
      * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
      * <li>{@link #MIN_FOUND_TOKENS}
+     * <li> {@link #MIN_TOKEN_MATCH_FACTOR}
      * </ul>
      * This Method create an new {@link EntityLinkerConfig} instance only if
      * <code>{@link #linkerConfig} == null</code>. If the instance is already initialised
@@ -760,6 +766,30 @@ public class KeywordLinkingEngine 
                 linkerConfig.setDefaultLanguage(defaultLang);
             }
         }
+        // init MIN_TOKEN_MATCH_FACTOR
+        value=configuration.get(MIN_TOKEN_MATCH_FACTOR);
+        float minTokenMatchFactor;
+        if(value instanceof Number){
+            minTokenMatchFactor = ((Number)value).floatValue();
+        } else if(value != null){
+            try {
+                minTokenMatchFactor = Float.valueOf(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, 
+                    "Unable to parse the minimum token match factor from the parsed value "+value,e);
+            }
+            if(minTokenMatchFactor < 0){
+                minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
+            }
+        } else {
+            minTokenMatchFactor = EntityLinkerConfig.DEFAULT_MIN_TOKEN_MATCH_FACTOR;
+        }
+        if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){
+            throw new ConfigurationException(MIN_TOKEN_MATCH_FACTOR, 
+                "The minimum token match factor MUST be > 0 and <= 1 (negative values for the default)");
+        }
+        linkerConfig.setMinTokenMatchFactor(minTokenMatchFactor);
+
         //init type mappings
         value = configuration.get(TYPE_MAPPINGS);
         if(value instanceof String[]){ //support array

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1360296&r1=1360295&r2=1360296&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Wed Jul 11 16:59:08 2012
@@ -84,7 +84,16 @@ public class EntityLinker {
      * Steps over the sentences, chunks, tokens of the {@link #sentences}
      */
     public void process() throws EngineException {
+        int debugedIndex = 0;
         while(state.next()) {
+            if(log.isDebugEnabled() && (state.getTokenIndex() > debugedIndex || state.getTokenIndex() ==  0)){
+                debugedIndex = state.getTokenIndex();
+                Token token = state.getToken();
+                log.debug(" {} {} (pos:{}|prop:{})",new Object[]{
+                    isProcessableToken(token)? '+':'-',
+                    token.getText(),token.getPosTags(),token.getPosProbabilities()
+                });
+            }
             if(isProcessableToken(state.getToken())){
                 List<String> searchStrings = new ArrayList<String>(config.getMaxSearchTokens());
                 searchStrings.add(state.getToken().getText());
@@ -96,6 +105,13 @@ public class EntityLinker {
                                 state.getChunk().getEnd() : //the chunk
                                     state.getSentence().getTokens().size()-1))){ //or sentence
                     Token included = state.getSentence().getTokens().get(includeTokenIndex);
+                    if(log.isDebugEnabled()  && includeTokenIndex > debugedIndex){
+                        debugedIndex = includeTokenIndex;
+                        log.debug(" {} {} (pos:{}|prop:{})",new Object[]{
+                            isProcessableToken(included)? '+':'-',
+                            included.getText(),included.getPosTags(),included.getPosProbabilities()
+                        });
+                    }
                     includeTokenIndex++;
                     if(isProcessableToken(included)){
                         searchStrings.add(included.getText());
@@ -355,20 +371,7 @@ public class EntityLinker {
         }
         return match;
     }
-
-    /**
-     * The default value for the maximum number or non-processable tokens
-     * allowed to be not matching with a label of an entity before the matching
-     * is stopped.
-     */
-    private static int DEFAULT_MAX_NOT_FOUND = 1; 
-    /**
-    * The value for the maximum number or non-processable tokens
-    * allowed to be not matching with a label of an entity before the matching
-    * is stopped.
-     * TODO: make configurable!
-    */
-    private int maxNotFound = DEFAULT_MAX_NOT_FOUND;
+    
     /**
      * @param match
      * @param label
@@ -414,6 +417,7 @@ public class EntityLinker {
         String currentTokenText;
         int currentTokenLength;
         int notFound = 0;
+        float minTokenMatchFactor = config.getMinTokenMatchFactor();
         //search for matches within the correct order
         for(int currentIndex = state.getTokenIndex();
                 currentIndex < state.getSentence().getTokens().size() 
@@ -435,9 +439,9 @@ public class EntityLinker {
                     int labelTokenLength = labelTokenText.length();
                     float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                     float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
-                    if((lengthDif/maxLength)<=0.3f){ //this prevents unnecessary string comparison 
-                        int matchCount = compairTokens(currentTokenText, labelTokenText);
-                        if(matchCount/maxLength >= 0.7f){
+                    if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison 
+                        int matchCount = compareTokens(currentTokenText, labelTokenText);
+                        if(matchCount/maxLength >= minTokenMatchFactor){
                             lastfoundLabelIndex = i; //set the last found index to the current position
                             found = true; //set found to true -> stops iteration
                             matchFactor = matchCount/maxLength; //how good is the match
@@ -468,7 +472,7 @@ public class EntityLinker {
                     lastFoundIndex = currentIndex;
                 } else { //not found
                     notFound++;
-                    if(isProcessable || notFound > maxNotFound){
+                    if(isProcessable || notFound > config.getMaxNotFound()){
                         //stop as soon as a token that needs to be processed is
                         //not found in the label or the maximum number of tokens
                         //that are not processable are not found
@@ -498,9 +502,9 @@ public class EntityLinker {
                 int labelTokenLength = labelTokenText.length();
                 float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                 float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
-                if((lengthDif/maxLength)<=0.3f){ //this prevents unnecessary string comparison 
-                    int matchCount = compairTokens(currentTokenText, labelTokenText);
-                    if(matchCount/maxLength >= 0.7f){
+                if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison 
+                    int matchCount = compareTokens(currentTokenText, labelTokenText);
+                    if(matchCount/maxLength >= minTokenMatchFactor){
                         found = true; //set found to true -> stops iteration
                         matchFactor = matchCount/maxLength; //how good is the match
                     }
@@ -515,7 +519,7 @@ public class EntityLinker {
                     currentIndex --;
                 } else {
                     notFound++;
-                    if(isProcessable || notFound > maxNotFound){
+                    if(isProcessable || notFound > config.getMaxNotFound()){
                         //stop as soon as a token that needs to be processed is
                         //not found in the label or the maximum number of tokens
                         //that are not processable are not found
@@ -577,7 +581,7 @@ public class EntityLinker {
      * @param token2 the second token
      * @return the number of matching chars
      */
-    private int compairTokens(String token1,String token2){
+    private int compareTokens(String token1,String token2){
         int l1 = token1.length(); //length of the first token
         int l2 = token2.length(); //length of the second token
         //in case of same length check for equals first
@@ -626,7 +630,7 @@ public class EntityLinker {
             do {
                 processToken = content.processPOS(posTags[i],posProb[i]);
                 i++;
-            } while(processToken != null && processToken.equals(Boolean.FALSE) && i<posTags.length);
+            } while(processToken == null && i<posTags.length);
         }
         if(processToken == null) {
              processToken = token.getText().length() >= config.getMinSearchTokenLength();

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1360296&r1=1360295&r2=1360296&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java Wed Jul 11 16:59:08 2012
@@ -211,6 +211,45 @@ public class EntityLinkerConfig {
      * detected for the text.
      */
     private String defaultLanguage = DEFAULT_LANGUAGE;
+    
+    /**
+     * Default for the maximum number of non-processable tokens that are 
+     * allowed to not match before no further tokens are matched against a label 
+     * of an Entity. <p>
+     * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
+     * as '.' is a non-processable token in the text that is missing in the
+     * label.<p>
+     * The default is set to <code>1</code>
+     */
+    public final static int DEFAULT_MAX_NOT_FOUND = 1; 
+    /**
+     * Value of the maximum number of non-processable tokens that are 
+     * allowed to not match before no further tokens are matched against a label 
+     * of an Entity. <p>
+     * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
+     * as '.' is a non-processable token in the text that is missing in the
+     * label.
+    */
+    private int maxNotFound;
+    /**
+     * Default value for the minimum token match factor.
+     * Whether Tokens match is determined by comparing them using some algorithm.
+     * Results need to be in the range [0..1]. This factor defines the minimum
+     * similarity value so that a match is assumed. Note that this factor is only
+     * used for filtering out non-matching tokens. The similarity value will
+     * still be used for calculating the confidence.<p>
+     * The default is set to <code>0.7</code>.
+     */
+    public final static float DEFAULT_MIN_TOKEN_MATCH_FACTOR = 0.7f;
+    /**
+     * Whether Tokens match is determined by comparing them using some algorithm.
+     * Results need to be in the range [0..1]. This factor defines the minimum
+     * similarity value so that a match is assumed. Note that this factor is only
+     * used for filtering out non-matching tokens. The similarity value will
+     * still be used for calculating the confidence.
+     */
+    private float minTokenMatchFactor;
+    
     /**
      * Default constructor the initialises the configuration with the 
      * default values
@@ -226,6 +265,8 @@ public class EntityLinkerConfig {
         setNameField(DEFAULT_NAME_FIELD);
         setRedirectField(DEFAULT_REDIRECT_FIELD);
         setTypeField(DEFAULT_TYPE_FIELD);
+        setMaxNotFound(DEFAULT_MAX_NOT_FOUND);
+        setMinTokenMatchFactor(DEFAULT_MIN_TOKEN_MATCH_FACTOR);
     }
     /**
      * Getter for the uri of the field used for the names in the taxonomy
@@ -483,4 +524,62 @@ public class EntityLinkerConfig {
     public String getDefaultLanguage() {
         return defaultLanguage;
     }
+    /**
+     * Getter for the maximum number of non-processable tokens that are 
+     * allowed to not match before no further tokens are matched against a label 
+     * of an Entity. <p>
+     * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
+     * as '.' is a non-processable token in the text that is missing in the
+     * label.
+     * @return the maxNotFound
+     */
+    public int getMaxNotFound() {
+        return maxNotFound;
+    }
+    /**
+     * Setter for the maximum number of non-processable tokens that are 
+     * allowed to not match before no further tokens are matched against a label 
+     * of an Entity. <p>
+     * This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
+     * as '.' is a non-processable token in the text that is missing in the
+     * label.
+     * @param maxNotFound the maxNotFound to set
+     */
+    public void setMaxNotFound(int maxNotFound) {
+        if(maxNotFound < 0){
+            this.maxNotFound = DEFAULT_MAX_NOT_FOUND;
+        } else {
+            this.maxNotFound = maxNotFound;
+        }
+    }
+    /**
+     * Getter for the minimum token match factor.
+     * Whether Tokens match is determined by comparing them using some algorithm.
+     * Results need to be in the range [0..1]. This factor defines the minimum
+     * similarity value so that a match is assumed. Note that this factor is only
+     * used for filtering out non-matching tokens. The similarity value will
+     * still be used for calculating the confidence.
+     * @return the minTokenMatchFactor
+     */
+    public float getMinTokenMatchFactor() {
+        return minTokenMatchFactor;
+    }
+    /**
+     * Setter for the minimum token match factor.
+     * Whether Tokens match is determined by comparing them using some algorithm.
+     * Results need to be in the range [0..1]. This factor defines the minimum
+     * similarity value so that a match is assumed. Note that this factor is only
+     * used for filtering out non-matching tokens. The similarity value will
+     * still be used for calculating the confidence.
+     * @param minTokenMatchFactor the minTokenMatchFactor to set
+     */
+    public void setMinTokenMatchFactor(float minTokenMatchFactor) {
+        if(minTokenMatchFactor < 0 ){
+            this.minTokenMatchFactor = DEFAULT_MIN_TOKEN_MATCH_FACTOR;
+        } else if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){
+            throw new IllegalArgumentException("minimum Token Match Facter MUST be > 0 <= 1 (parsed: "+minTokenMatchFactor+")!");
+        } else {
+            this.minTokenMatchFactor = minTokenMatchFactor;
+        }
+    }
 }
\ No newline at end of file

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java?rev=1360296&r1=1360295&r2=1360296&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java Wed Jul 11 16:59:08 2012
@@ -113,6 +113,7 @@ public class OpenNlpAnalysedContentFacto
     private class OpenNlpAnalysedContent implements AnalysedContent{
         private final TextAnalyzer analyzer;
         private final double minPosTagProbability;
+        private final double minExcludePosTagProbability;
         private final Iterator<AnalysedText> sentences;
         private final Set<String> posTags;
         private final Tokenizer tokenizer;
@@ -124,6 +125,7 @@ public class OpenNlpAnalysedContentFacto
                 analyzer.getLanguage(), PosTypeCollectionType.NOUN);
             this.tokenizer = analyzer.getTokenizer();
             this.minPosTagProbability = analyzer.getConfig().getMinPosTypeProbability();
+            this.minExcludePosTagProbability = minPosTagProbability/2;
         }
         
         /**
@@ -135,19 +137,45 @@ public class OpenNlpAnalysedContentFacto
             return sentences;
         }
         /**
-         * Called to check if a {@link Token} should be used to search for
-         * Concepts within the Taxonomy based on the POS tag of the Token.
-         * @param posTag the POS tag to check
-         * @param posProb the probability of the parsed POS tag
-         * @return <code>true</code> if Tokens with this POS tag should be
-         * included in searches. Otherwise <code>false</code>. Also returns
-         * <code>true</code> if no POS type configuration is available for the
-         * language parsed in the constructor
+         * This now uses two tag probabilities<ul>
+         * <li> {@link TextAnalyzerConfig#getMinPosTypeProbability()} for
+         * accepting POS tags that represent nouns and
+         * <li> <code>minPosTypeProb/2</code> for rejecting POS tags that
+         * are not nouns
+         * </ul>
+         * Assuming that <code>minPosTypeProb=0.667</code>, a<ul>
+         * <li> noun with the prob 0.8 would result in returning <code>true</code>
+         * <li> noun with prob 0.5 would return <code>null</code>
+         * <li> verb with prob 0.4 would return <code>false</code>
+         * <li> verb with prob 0.3 would return <code>null</code>
+         * </ul>
+         * This new algorithm makes it less likely that non-nouns are processed
+         * by the KeywordLinkingEngine, because <code>null</code> is returned less
+         * often now that the minimum probability for rejecting them is much lower.<p>
+         * <i>NOTE:</i> Returning <code>null</code> usually results in using
+         * the fall-back (typically minTokenLength = 3), so previously most of those
+         * tokens were processed by the KeywordLinkingEngine.
+         * (see also STANBOL-685)
          */
         @Override
         public Boolean processPOS(String posTag, double posProb) {
-            return posTags != null && posProb > minPosTagProbability ? 
-                    Boolean.valueOf(posTags.contains(posTag)) : null;
+            if(posTags != null){
+                if(posTags.contains(posTag)){
+                    if(posProb >= minPosTagProbability){
+                        return Boolean.TRUE;
+                    } else {
+                        return null; //probability to low
+                    }
+                } else {
+                    if(posProb >= minExcludePosTagProbability){
+                        return Boolean.FALSE;
+                    } else {
+                        return null; //probability to low
+                    }
+                }
+            } else {
+                return null;
+            }
         }
         /**
          * Not yet implemented.

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1360296&r1=1360295&r2=1360296&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Jul 11 16:59:08 2012
@@ -115,3 +115,11 @@ org.apache.stanbol.enhancer.engines.keyw
 to use a special Tokenizer for matching keywords and alpha numeric IDs. Typical language \
 specific Tokenizers tned to split such IDs in several tokens and therefore might prevent \
 a correct matching.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor.name=Minimum Token Match Factor
+org.apache.stanbol.enhancer.engines.keywordextraction.minTokenMatchFactor.description=If a Token \
+of the text is compared with a Token in the Label of an Entity the similarity of those is \
+expressed in the range [0..1]. This factor specifies the minimum similarity of two Tokens \
+so that they are considered to match. Lower values will allow more Tokens to match (e.g \
+inflected forms of words) but may also result in false positives. Regardless of the \
+configured value the similarity will influence the confidence of suggestions.