You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/10 07:02:13 UTC

svn commit: r1557037 - in /stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking: FstLinkingEngine.java FstLinkingEngineComponent.java LinkableTokenFilter.java

Author: rwesten
Date: Fri Jan 10 06:02:13 2014
New Revision: 1557037

URL: http://svn.apache.org/r1557037
Log:
STANBOL-1252: Added support for MIN_FOUND_TOKENS, which allows configuring a static lower limit on the number of Tokens that need to match within a processable chunk (typically noun phrases). Minor: changed the order of the chained TagClusterReducers so that the LinkableTokenFilter needs to process fewer potential tags.

Modified:
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1557037&r1=1557036&r2=1557037&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Fri Jan 10 06:02:13 2014
@@ -370,13 +370,13 @@ public class FstLinkingEngine implements
             new CharSequenceReader(at.getText()));
         LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, 
             at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()),
-            elConfig.getMinChunkMatchScore());
+            elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
         //we use two TagClusterReducer implementations.
         // (1) the linkableTokenFilter filters all tags that do not overlap any
         //     linkable Token
         // (2) the LONGEST_DOMINANT_RIGHT reducer (TODO: make configurable)
         TagClusterReducer reducer = new ChainedTagClusterReducer(
-            linkableTokenFilter,TagClusterReducer.LONGEST_DOMINANT_RIGHT);
+            TagClusterReducer.LONGEST_DOMINANT_RIGHT, linkableTokenFilter);
         final long[] time = new long[]{0};
         new Tagger(corpus.getFst(), linkableTokenFilter, reducer,session.isSkipAltTokens()) {
             

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1557037&r1=1557036&r2=1557037&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Fri Jan 10 06:02:13 2014
@@ -213,6 +213,13 @@ public class FstLinkingEngineComponent {
      * The default size of the Entity Cache is set to 65k entities.
      */
     public static final int DEFAULT_ENTITY_CACHE_SIZE = 65536;
+
+    /**
+     * Changed default for the {@link EntityLinkerConfig#MIN_FOUND_TOKENS} property.
+     * This Engine uses <code>2</code> as default. While the {@link EntityLinkerConfig}
+     * currently sets the default to <code>1</code>
+     */
+    private static final Integer FST_DEFAULT_MIN_FOUND_TOKENS = 2;
     
     private final Logger log = LoggerFactory.getLogger(FstLinkingEngineComponent.class);
     /**
@@ -352,7 +359,13 @@ public class FstLinkingEngineComponent {
         //(1) parse the TextProcessing configuration
         //TODO: decide if we should use the TextProcessingConfig for this engine
         textProcessingConfig = TextProcessingConfig.createInstance(properties);
+        //change default for EntityLinkerConfig.MIN_FOUND_TOKENS
+        value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
         entityLinkerConfig = EntityLinkerConfig.createInstance(properties, prefixService);
+        if(value == null){ //no MIN_FOUND_TOKENS config present
+            //manually set the default to the value used by this engine
+            entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
+        }
         
         //(2) parse the configured IndexReference
         value = properties.get(SOLR_CORE);

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1557037&r1=1557036&r2=1557037&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java Fri Jan 10 06:02:13 2014
@@ -161,9 +161,14 @@ public final class LinkableTokenFilter e
      * {@link Chunk} so that is is not omitted. 
      */
     private double minChunkMatchScore;
+    /**
+     * The minimum amount of matched (matchable) Tokens so that an Entity is
+     * considered. Only used within processable chunks
+     */
+    private int minFoundTokens;
     
     protected LinkableTokenFilter(TokenStream input, AnalysedText at, 
-            String lang, LanguageProcessingConfig lpc, double minChunkMatchScore) {
+            String lang, LanguageProcessingConfig lpc, double minChunkMatchScore, int minFoundTokens) {
         super(input);
         //STANBOL-1177: add attributes in doPrivileged to avoid 
         //AccessControlException: access denied ("java.lang.RuntimePermission" "getClassLoader")
@@ -185,6 +190,7 @@ public final class LinkableTokenFilter e
         this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
                 UNICASE_SCRIPT_LANUAGES.contains(lang);
         this.minChunkMatchScore = minChunkMatchScore;
+        this.minFoundTokens = minFoundTokens;
     }
 
     @Override
@@ -359,13 +365,13 @@ public final class LinkableTokenFilter e
                 tag.removeLL(); //remove the tag from the cluster
                 if(log.isTraceEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, end);
-                    log.trace(" > reduce tag {}", tagSequence);
+                    log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence);
                 }
             } else { //if the tag overlaps a linkable token 
                 TokenData linkableToken = linkableTokenContext.linkableToken;
                 List<TokenData> tokens = linkableTokenContext.context;
                 ChunkData cd = linkableToken.inChunk; //check if it maches > 50% of the chunk
-                if(!lpc.isIgnoreChunks() && cd != null &&
+                 if(!lpc.isIgnoreChunks() && cd != null &&
                         cd.isProcessable){
                     int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() :
                         start;
@@ -385,32 +391,32 @@ public final class LinkableTokenFilter e
                         }
                         //only accept tags with more as half of the matchable
                         //tokens in the Chunk are matched!
-                        if(((float)match/(float)num) < minChunkMatchScore){
+                        if(((float)match/(float)num) < minChunkMatchScore &&
+                                match < minFoundTokens){
                             tag.removeLL(); //ignore
                             if(log.isTraceEnabled()){
                                 CharSequence text = at.getText();
-                                log.trace(" - reduce tag {}[{},{}] because it does only match "
+                                log.trace(" - reduce tag {}[{},{}] - does only match "
                                     + "{} of {} of matchable Chunk {}[{},{}]", 
                                     new Object[]{text.subSequence(start, end), start, end, match,  
                                             num, text.subSequence(cstart, cend), cstart, cend});
                             }
                         } else if(log.isTraceEnabled()){
                             CharSequence text = at.getText();
-                            log.trace(" + keep tag {}[{},{}] matching {} of {} "
+                            log.trace(" + keep tag {}[{},{}] - matches {} of {} "
                                 + "matchable Tokens for matchable Chunk {}[{},{}]", 
                                 new Object[]{text.subSequence(start, end), start, end, match,
                                         num, text.subSequence(cstart, cend), cstart, cend});
                         }
                     } else if(log.isTraceEnabled()){
                         CharSequence text = at.getText();
-                        log.trace(" + keep tag {}[{},{}] for matchable Chunk {}[{},{}]", 
+                        log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]", 
                             new Object[]{text.subSequence(start, end), start, end, 
                                  text.subSequence(cstart, cend), cstart, cend});
                     }
-                }
-                if(log.isTraceEnabled()){
+                } else if(log.isTraceEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, end);
-                    log.trace(" + keep tag {}", tagSequence);
+                    log.trace(" + keep tag {} - not in processable chunk", tagSequence);
                 }
             }
         }