You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/11/19 14:51:47 UTC

svn commit: r1543431 - in /stanbol/branches/release-0.12: ./ data/defaultconfig/src/main/resources/config/ enhancement-engines/ enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/ enhancement...

Author: rwesten
Date: Tue Nov 19 13:51:46 2013
New Revision: 1543431

URL: http://svn.apache.org/r1543431
Log:
merged STANBOL-1211 to the 0.12 releasing branch

Modified:
    stanbol/branches/release-0.12/   (props changed)
    stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
    stanbol/branches/release-0.12/enhancement-engines/   (props changed)
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java

Propchange: stanbol/branches/release-0.12/
------------------------------------------------------------------------------
  Merged /stanbol/trunk:r1543372-1543373,1543405

Modified: stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config (original)
+++ stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config Tue Nov 19 13:51:46 2013
@@ -1,3 +1,3 @@
 stanbol.enhancer.chain.name="dbpedia-fst-linking"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","dbpedia-fst"]
+stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","opennlp-chunker","dbpedia-fst"]
 service.ranking=I"0"
\ No newline at end of file

Propchange: stanbol/branches/release-0.12/enhancement-engines/
------------------------------------------------------------------------------
  Merged /stanbol/trunk/enhancement-engines:r1543372-1543373

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java Tue Nov 19 13:51:46 2013
@@ -37,6 +37,7 @@ import org.apache.stanbol.enhancer.engin
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
 import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
@@ -168,6 +169,14 @@ public class EntityLinkerConfig {
      */
     public static final String MIN_MATCH_FACTOR = "enhancer.engines.linking.minMatchScore";
     /**
+     * The minimum fraction of the matchable {@link Token}s within a processable
+     * {@link Chunk} an Entity must match. By {@link #DEFAULT_MIN_CHUNK_MATCH_SCORE default} this is
+     * set to <code>51%</code> to filter Entities that only match a single token
+     * within a NounPhrase of two words. This feature was introduced with
+     * <a href="https://issues.apache.org/jira/browse/STANBOL-1211">STANBOL-1211</a>
+     */
+    public static final String MIN_CHUNK_MATCH_SCORE = "enhancer.engines.linking.minChunkMatchScore";
+    /**
      * The maximum number of {@link Token} used as search terms with the 
      * {@link EntitySearcher#lookup(String, Set, java.util.List, String[], Integer)}
      * method
@@ -263,6 +272,13 @@ public class EntityLinkerConfig {
     public static final double DEFAULT_MIN_TEXT_SCORE = 0.4;
     public static final double DEFAULT_MIN_MATCH_SCORE = 0.3;
     /**
+     * By default more than 50% of the matchable tokens of a processable chunk
+     * need to match so that an Entity is considered to be mentioned in the text
+     * (STANBOL-1211)
+     */
+    public static final double DEFAULT_MIN_CHUNK_MATCH_SCORE = 0.51;
+    
+    /**
      * Default mapping for Concept types to dc:type values added for
      * TextAnnotations.
      */
@@ -449,6 +465,11 @@ public class EntityLinkerConfig {
     private double minLabelScore = DEFAULT_MIN_LABEL_SCORE;
     private double minTextScore = DEFAULT_MIN_TEXT_SCORE;
     private double minMatchScore = DEFAULT_MIN_MATCH_SCORE;
+    /**
+     * The minimum score an entity needs to match matchable tokens within a
+     * chunk so that it is considered a mention (STANBOL-1211)
+     */
+    private double minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE;
 
     private boolean rankEqualScoresBasedOnEntityRankings = DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS;
 
@@ -632,7 +653,25 @@ public class EntityLinkerConfig {
         } catch (IllegalArgumentException e){
             throw new ConfigurationException(MIN_MATCH_FACTOR, e.getMessage());
         }
-                
+        
+        value = configuration.get(MIN_CHUNK_MATCH_SCORE);
+        Double minChunkMatchScore = null;
+        if(value instanceof Number){
+            minChunkMatchScore = Double.valueOf(((Number)value).doubleValue());
+        } else if(value != null){
+            try {
+                minChunkMatchScore = Double.valueOf(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, "Parsed value '"
+                        +value+"' is not an valid double!");
+            }
+        }
+        try {
+            linkerConfig.setMinChunkMatchScore(minChunkMatchScore);
+        } catch (IllegalArgumentException e){
+            throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, e.getMessage());
+        }
+        
         //init LEMMA_MATCHING_STATE
         value = configuration.get(LEMMA_MATCHING_STATE);
         if(value instanceof Boolean){
@@ -1085,14 +1124,15 @@ public class EntityLinkerConfig {
      */
     public UriRef setTypeMapping(String conceptType, UriRef dcType){
         if(dcType == null) {
-            throw new IllegalArgumentException("The parsed dc:type URI MUST NOT be NULL!");
-        }
-        if(conceptType == null){ //handle setting of the default dc:type value
-            UriRef oldDefault = getDefaultDcType();
-            setDefaultDcType(dcType);
-            return oldDefault;
+            return typeMappings.remove(conceptType == null ? null : new UriRef(conceptType));
+        } else {
+            if(conceptType == null){ //handle setting of the default dc:type value
+                UriRef oldDefault = getDefaultDcType();
+                setDefaultDcType(dcType);
+                return oldDefault;
+            }
+            return typeMappings.put(new UriRef(conceptType), dcType);
         }
-        return typeMappings.put(new UriRef(conceptType), dcType);
     }
     
     /**
@@ -1306,7 +1346,35 @@ public class EntityLinkerConfig {
         } else {
             minTextScore = score;
         }
-    }    
+    }
+    /**
+     * Getter for the minimum amount of matchable {@link Token}s an Entity must match
+     * within a {@link Chunk} to be considered (see STANBOL-1211).<p>
+     * The default is <code>&gt;0.5</code> to omit matches for a single token
+     * in a chunk - typically a noun phrase - including two words.
+     * @return the minimum chunk match score.
+     */
+    public double getMinChunkMatchScore() {
+        return minChunkMatchScore;
+    }
+    /**
+     * Setter for the minimum amount of matchable {@link Token}s an Entity must match
+     * within a {@link Chunk} to be considered (see STANBOL-1211).<p>
+     * The default is <code>&gt;0.5</code> to omit matches for a single token
+     * in a chunk - typically a noun phrase - including two words.
+     * @param minChunkMatchScore the minimum chunk match score or <code>null</code>
+     * to reset to the default value. Values outside [0..1] (including NaN) are rejected
+     */
+    public void setMinChunkMatchScore(Double minChunkMatchScore) {
+        if(minChunkMatchScore == null){
+            this.minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE;
+        } else if(!(minChunkMatchScore >= 0.0 && minChunkMatchScore <= 1.0)){ //negated so NaN fails too
+            throw new IllegalArgumentException("The minChunkMatchScore MUST BE "
+                + "in the range [0..1] (parsed: "+minChunkMatchScore+")!");
+        } else {
+            this.minChunkMatchScore = minChunkMatchScore;
+        }
+    }
     /**
      * Getter for the minimum match Score of Entity labels against the
      * Text.<p>

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Tue Nov 19 13:51:46 2013
@@ -63,6 +63,23 @@ public class ChunkData {
      */
     int matchableCount;
     /**
+     * The start position of the first matchable {@link Token} within this
+     * chunk
+     */
+    int matchableStart = -1;
+    /**
+     * The start char offset of the first matchable {@link Token} within this chunk
+     */
+    int matchableStartCharIndex = -1;
+    /**
+     * The end position of the last matchable {@link Token} within this chunk
+     */
+    int matchableEnd = -1;
+    /**
+     * The end char offset of the last matchable {@link Token} within this chunk
+     */
+    int matchableEndCharIndex = -1;
+    /**
      * constructs and initializes the meta data for the parsed {@link Chunk}
      * @param chunk
      */
@@ -121,4 +138,37 @@ public class ChunkData {
     public int getEndTokenIndex() {
         return endToken;
     }
+    /**
+     * The index of the first matchable Token within the {@link Chunk} or
+     * <code>-1</code> if none
+     * @return
+     */
+    public int getMatchableStart() {
+        return matchableStart;
+    }
+    /**
+     * The index of the last matchable Token within the {@link Chunk} or
+     * <code>-1</code> if none
+     * @return
+     */
+    public int getMatchableEnd() {
+        return matchableEnd;
+    }
+    /**
+     * The char index of the start character of the first matchable {@link Token}
+     * within the {@link Chunk} or <code>-1</code> if none.
+     * @return
+     */
+    public int getMatchableStartChar() {
+        return matchableStartCharIndex;
+    }
+    /**
+     * The char index of the end character of the last matchable {@link Token}
+     * within the {@link Chunk} or <code>-1</code> if none
+     * @return
+     */
+    public int getMatchableEndChar() {
+        return matchableEndCharIndex;
+    }
+    
 }
\ No newline at end of file

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Tue Nov 19 13:51:46 2013
@@ -25,6 +25,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.NavigableMap;
 import java.util.Set;
@@ -35,6 +36,7 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.lang.LocaleUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
@@ -162,24 +164,19 @@ public class EntityLinker {
             //Determine the range we are allowed to search for tokens
             final int minIncludeIndex;
             final int maxIndcludeIndex;
+            int consumedIndex = state.getConsumedIndex();
             //NOTE: testing has shown that using Chunks to restrict search for
             //      additional matchable tokens does have an negative impact on
             //      recall. Because of that this restriction is for now deactivated
-           //TODO: maybe make configurable via an own property
-            boolean restrirctContextByChunks = textProcessingConfig.isIgnoreChunks();
-            int consumedIndex = state.getConsumedIndex();
-            if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() &&
-                    restrirctContextByChunks){
-                minIncludeIndex = token.inChunk.getStartTokenIndex();
-//                minIncludeIndex = Math.max(
-//                    state.getConsumedIndex()+1, 
-//                    token.inChunk.getStartTokenIndex());
-                maxIndcludeIndex = token.inChunk.getEndTokenIndex();
-            } else {
+//            if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks()){
+//                minIncludeIndex = token.inChunk.getStartTokenIndex();
+//                maxIndcludeIndex = token.inChunk.getEndTokenIndex();
+//                log.debug("  - restrict context to chunk[{}, {}]",
+//                    minIncludeIndex, maxIndcludeIndex);
+//            } else {
                 maxIndcludeIndex = state.getTokens().size() - 1;
-//                minIncludeIndex = state.getConsumedIndex() + 1;
                 minIncludeIndex = 0;
-            }
+//            }
             int prevIndex = token.index;
             int pastIndex = token.index;
             int pastNonMatchable = 0;
@@ -766,12 +763,19 @@ public class EntityLinker {
             PlainLiteral label = labels.next();
             numLabels++;
             String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
+            String text = label.getLexicalForm();
+            //if case-insensitive matching ... compare lower case versions
+            if(!linkerConfig.isCaseSensitiveMatching()){
+                text = text.toLowerCase(Locale.ROOT);
+            }
             if((lang == null && curLang == null) ||
                     (lang != null && curLang != null && lang.equalsIgnoreCase(curLang))){
-                if(!matchedLabels.contains(label.getLexicalForm())){
+                if(!matchedLabels.contains(text)){
                     matchLabel(searchTokens, match, label);
-                    matchedLabels.add(label.getLexicalForm());
+                    matchedLabels.add(text);
                     matchedLangLabel = true;
+                } else if(!matchedLangLabel){
+                    matchedLangLabel = true; //found a equivalent label in the matchlang
                 }
             } else if((lang == null && mainLang == null) ||
                     (lang != null && mainLang != null && lang.equalsIgnoreCase(mainLang))){
@@ -1043,6 +1047,43 @@ public class EntityLinker {
             final LabelMatch labelMatch;
             int coveredTokens = lastFoundIndex-firstFoundIndex+1;
             int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
+            //check if we lookup Entities within a processable chunk
+            final float chunkMatchScore;
+            if(!textProcessingConfig.isIgnoreChunks() &&
+                    state.getToken().inChunk != null &&  //there is a chunk
+                    state.getToken().inChunk.isProcessable){ //the chunk is processable
+                ChunkData cd = state.getToken().inChunk;
+                List<TokenData> tokens = state.getTokens();
+                if(log.isTraceEnabled()){
+                    log.trace("  ... checking match with chunk {}: {}", 
+                        cd.chunk, cd.chunk.getSpan());
+                }
+                int cstart = cd.getMatchableStart() >= 0 ? cd.getMatchableStart() :
+                    firstProcessableFoundIndex;
+                int cend = cd.getMatchableEnd(); //token index (NOT the char offset) of the last matchable token
+                //if the match does not cover the whole chunk (both sides are token indexes)
+                if(cstart < firstProcessableFoundIndex || cend > lastProcessableFoundIndex){ 
+                    int foundInChunk = 0;
+                    int numInChunk = 0;
+                    for(int i = cd.matchableStart; i <= cd.matchableEnd ; i++){
+                        TokenData td = tokens.get(i);
+                        if(td.isMatchable){
+                            numInChunk++;
+                            if(i >= firstProcessableFoundIndex &&
+                                    i <= lastProcessableFoundIndex){
+                                foundInChunk++;
+                            }
+                        }
+                    }
+                    chunkMatchScore = (float) foundInChunk / (float) numInChunk;
+                    log.trace("  ... label matches {} of {} matchable token in Chunk", 
+                        foundInChunk, numInChunk);
+                } else { //matches the whole chunk
+                    chunkMatchScore = 1f;
+                }
+            } else { //no chunk (or ignoreChunks == true) .. set chunkMatchScore to 1f
+                chunkMatchScore = 1f;
+            }
             //matched tokens only within the span of the first/last processable token
             //Matching rules
             // - if less than config#minTokenFound() than accept only EXACT
@@ -1050,10 +1091,12 @@ public class EntityLinker {
             //   foundTokens of the PARTIAL match is > than of the FULL/EXACT
             //   match (this will be very rare
             String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
-            if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){ 
+            if(chunkMatchScore == 1f && //the whole chunk matches
+                    (linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text))){ 
                 labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
-            } else {
-                int coveredLabelTokens = matchedLabelTokens.lastKey().intValue()-matchedLabelTokens.firstKey().intValue()+1;
+            } else if(chunkMatchScore >= linkerConfig.getMinChunkMatchScore()){
+                int coveredLabelTokens = matchedLabelTokens.lastKey().intValue() -
+                        matchedLabelTokens.firstKey().intValue() + 1;
                 if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
                     //if all token matched set found to covered: May be lower because only
                     //processable tokens are counted, but FULL also checks
@@ -1064,10 +1107,30 @@ public class EntityLinker {
                 labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens, 
                     foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
                     foundTokenMatch/(float)foundTokens,label,labelTokens.length, coveredLabelTokens);
+            } else {
+                if(log.isTraceEnabled()){ //trace level logging for STANBOL-1211
+                    List<TokenData> tokens = state.getTokens();
+                    int start = tokens.get(firstProcessableFoundIndex).token.getStart();
+                    int end = tokens.get(lastProcessableFoundIndex).token.getEnd();
+                    CharSequence content = state.getToken().token.getContext().getText();
+                    CharSequence match = content.subSequence(start, end);
+                    ChunkData cd = state.getToken().inChunk;
+                    int cStart = tokens.get(cd.matchableStart).token.getStart();
+                    int cEnd = tokens.get(cd.matchableEnd).token.getEnd();
+                    CharSequence context = content.subSequence(cStart, cEnd);
+                    log.trace(" - filter match '{}'@[{},{}] because it does only match "
+                            + "{}% (min: {}%) of the matchable Tokens in Chunk '{}'@[{},{}]",
+                            new Object[]{match, start, end, Math.round(chunkMatchScore*100),
+                                    Math.round(linkerConfig.getMinChunkMatchScore()*100),
+                                    context, cStart, cEnd});
+                }
+                labelMatch = null;
             }
-            if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() && 
+            if(labelMatch != null &&
+                    labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() && 
                     labelMatch.getTextScore() >= linkerConfig.getMinTextScore() && 
                     labelMatch.getMatchScore() >= linkerConfig.getMinMatchScore()){
+                log.trace(" + add suggestion {}", labelMatch);
                 suggestion.addLabelMatch(labelMatch);
             }
         } //else NO tokens found -> nothing to do

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Tue Nov 19 13:51:46 2013
@@ -162,6 +162,17 @@ public class SectionData {
                     } else if(tokenData.isMatchable){
                         activeChunk.matchableCount++;
                     }
+                    if(tokenData.isMatchable){ //for matchable tokens
+                        //update the matchable span within the active chunk
+                        if(activeChunk.matchableStart < 0){
+                            activeChunk.matchableStart = tokenData.index;
+                            activeChunk.matchableStartCharIndex = tokenData.token.getStart();
+                        }
+                        if(activeChunk.matchableStart >= 0){ //if start is set also set end
+                            activeChunk.matchableEnd = tokenData.index;
+                            activeChunk.matchableEndCharIndex = tokenData.token.getEnd();
+                        }
+                    }
                     if (span.getEnd() >= activeChunk.getEndChar()){
                         //this is the last token in the current chunk
                         activeChunk.endToken = tokens.size()-1;

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java Tue Nov 19 13:51:46 2013
@@ -295,6 +295,7 @@ public class EntityLinkingEngineTest {
         LanguageProcessingConfig tpc = new LanguageProcessingConfig();
         tpc.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
         tpc.setLinkedPos(Collections.EMPTY_SET);
+        tpc.setIgnoreChunksState(true); //to emulate pre STANBOL-1211
         EntityLinkerConfig config = new EntityLinkerConfig();
         config.setMinFoundTokens(2);//this is assumed by this test
         config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Tue Nov 19 13:51:46 2013
@@ -269,9 +269,14 @@ public class FstLinkingEngine implements
                         double length = Math.max(alength, matchLabel.getLexicalForm().length());
                         match.setMatch(1d - ((double)distance/length),matchLabel);
                     }
-                    log.trace(" ... add suggestion: label: '{}'; conf: {}", 
+                    if(match.getScore() >= elConfig.getMinMatchScore()){
+                        log.trace(" ... add suggestion: label: '{}'; conf: {}", 
                             matchLabel, match.getScore());
-                    suggestions.add(match);
+                        suggestions.add(match);
+                    } else {
+                        log.trace(" ... filtered because match score < {}", 
+                            elConfig.getMinMatchScore());
+                    }
                 } else { //the type of the current Entity is blacklisted
                     log.trace("  ... filtered because of entity types");
                 }
@@ -356,7 +361,8 @@ public class FstLinkingEngine implements
         TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", 
             new CharSequenceReader(at.getText()));
         LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, 
-            at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()));
+            at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()),
+            elConfig.getMinChunkMatchScore());
         //we use two TagClusterReducer implementations.
         // (1) the linkableTokenFilter filters all tags that do not overlap any
         //     linkable Token

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Tue Nov 19 13:51:46 2013
@@ -55,6 +55,7 @@ import org.apache.felix.scr.annotations.
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.PropertyOption;
 import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.FieldInfo;
@@ -222,7 +223,7 @@ public class FstLinkingEngineComponent {
     /**
      * used to resolve '{prefix}:{local-name}' used within the engines configuration
      */
-    @Reference
+    @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
     protected NamespacePrefixService prefixService;    
 
     /**

Modified: stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java (original)
+++ stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java Tue Nov 19 13:51:46 2013
@@ -20,12 +20,14 @@ import org.junit.Test;
 
 public class FstLinkingTest extends EnhancerTestBase {
 
-    
+    //NOTE: the text was adapted as part of STANBOL-1211 to avoid the single noun
+    //phrase "SPD candidate Peer Steinbrueck", which would prevent the linking of
+    //SPD in this text.
     public static final String TEST_TEXT = "There has been a worried response in "
             + "Greece to the Sunday's election in Germany. The win of Chancellor "
             + "Angela Merkel means that there will not be a radical change in "
-            + "European policy. Greeks would have preferred SPD candidate Peer "
-            + "Steinbrueck, whose party lost Sunday.";
+            + "European policy. Greeks would have preferred Peer Steinbrueck the "
+            + "candidate of the SPD, whose party lost Sunday.";
     
     /**
      * 
@@ -54,17 +56,20 @@ public class FstLinkingTest extends Enha
                 //and the entityLinkingEngine
                 "http://purl.org/dc/terms/creator.*FstLinkingEngine",
                 //needs to suggest the following Entities
-                "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Angela_Merkel",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Greece",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Germany",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Social_Democratic_Party_of_Germany",
                 //for the following sections within the text
-                "http://fise.iks-project.eu/ontology/selected-text.*Chancellor",
                 "http://fise.iks-project.eu/ontology/selected-text.*Angela Merkel",
                 "http://fise.iks-project.eu/ontology/selected-text.*Greece",
                 "http://fise.iks-project.eu/ontology/selected-text.*Germany",
-                "http://fise.iks-project.eu/ontology/selected-text.*SPD");
+                "http://fise.iks-project.eu/ontology/selected-text.*SPD")
+         //with STANBOL-1211 Chancellor MUST NOT be found as "Chancellor" does not
+         //select more than 50% of the tokens of the chunk "Chancellor Angela Merkel"
+         .assertContentRegexp(false, 
+                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
+                 "http://fise.iks-project.eu/ontology/selected-text.*Chancellor");
     }