You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/09/28 13:27:15 UTC
svn commit: r1176827 - in /incubator/stanbol/trunk: commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/ enhancer/engines/keywor...

Author: rwesten
Date: Wed Sep 28 11:27:15 2011
New Revision: 1176827

URL: http://svn.apache.org/viewvc?rev=1176827&view=rev
Log:
Added support for multiple POS tags to commons.opennlp and the keyword extraction components. In addition, the keyword extraction now uses POS tag probabilities to decide if the POS tag should be used or not.
This improves lookup for entities, especially for texts that do not represent full sentences as well as for foreign words (e.g. person names: Greek person names in Swedish texts)

This includes:

* Added support for multiple POS tags to tokens
* The PosTypeChunker (creates chunks based on POS types) now supports multiple POS tags
* The PosTypeChunker now supports POS tag probabilities
* The AnalysedContent interface now supports POS tag and chunk probabilities
* The EntityLinker now supports multiple POS tags 

Modified:
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java

Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java Wed Sep 28 11:27:15 2011
@@ -96,8 +96,11 @@ public enum PosTagsCollectionEnum {
      * for Portuguese.<p>
      * TODO: Someone who speaks this language should check this List<p>
      * NOTES: Currently this includes nouns, proper nouns and numbers.
+     * In addition I added "vp". "vp" is not part of the POS tag set 
+     * documentation but in the training set there is a single occurrence 
+     * therefore the POS tagger sometimes do tag words with this tag.
      */
-    PT_NOUN("pt",PosTypeCollectionType.NOUN,"n","num","prop"),
+    PT_NOUN("pt",PosTypeCollectionType.NOUN,"n","num","prop","vp"),
     /**
      * POS types for Verbs based on the
      * <a href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag set</a>
@@ -110,7 +113,7 @@ public enum PosTagsCollectionEnum {
      * <a href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag set</a>
      * for Portuguese.<p>
      * TODO: Someone who speaks this language should check this List<p>
-     * NOTES: Currently this pubctations and prepositions.
+     * NOTES: Currently this pubctations and prepositions. 
      */
     PT_FOLLOW("pt",PosTypeCollectionType.FOLLOW,"punc", "prp"),
     /**

Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java Wed Sep 28 11:27:15 2011
@@ -42,24 +42,29 @@ import opennlp.tools.util.Span;
  */
 public class PosTypeChunker {
     
-    public final Set<String> followTypes;
+    private final double minPosProb;
+    
+    private final Set<String> followTypes;
 
-    public final Set<String> buildTypes;
+    private final Set<String> buildTypes;
 
     /**
      * Creates an instance for the given language based on the configuration
      * within the {@link PosTagsCollectionEnum}.
      * @param lang The language
+     * @param minPosTagProbaility The minimum probability of a POS tag so that
+     * it is processed. In case of lower Probabilities POS tags are ignored and
+     * assumed to be matching.
      * @return the instance or <code>null</code> if no configuration for the
      * parsed language is present in the {@link PosTagsCollectionEnum}.
      */
-    public static PosTypeChunker getInstance(String lang){
+    public static PosTypeChunker getInstance(String lang,double minPosTagProbaility){
         Set<String> nounPosTagCollection = 
             PosTagsCollectionEnum.getPosTagCollection(lang, PosTypeCollectionType.NOUN);
         if(nounPosTagCollection != null && !nounPosTagCollection.isEmpty()){
             return new PosTypeChunker(nounPosTagCollection, 
                 PosTagsCollectionEnum.getPosTagCollection(
-                    lang,PosTypeCollectionType.FOLLOW));
+                    lang,PosTypeCollectionType.FOLLOW),minPosTagProbaility);
         } else {
             return null;
         }
@@ -76,7 +81,7 @@ public class PosTypeChunker {
      * @param followPosTypes additional POS types followed to extend Chunks (MAY
      * BE <code>null</code> or empty).
      */
-    public PosTypeChunker(Set<String> buildPosTypes,Set<String> followPosTypes){
+    public PosTypeChunker(Set<String> buildPosTypes,Set<String> followPosTypes,double minPosProb){
         if(buildPosTypes == null || buildPosTypes.isEmpty()){
             throw new IllegalArgumentException("The set of POS types used to" +
             		"build Chunks MUST NOT be NULL nor empty!");
@@ -88,17 +93,48 @@ public class PosTypeChunker {
             follow.addAll(followPosTypes);
         }
         this.followTypes = Collections.unmodifiableSet(follow);
+        if(minPosProb > 1){
+            throw new IllegalArgumentException("The minimum POS tag probalility MUST BE set to a value [0..1] or values < 0 to deactivate this feature (parsed="+minPosProb+")!");
+        } else {
+            this.minPosProb = minPosProb;
+        }
     }
     /**
-     * TODO: This might be language specific!
-     * @param pos
-     * @return
-     */
-    private boolean followPOS(String pos){
-        return followTypes.contains(pos);
+     * @param props the probabilities of the pos tags or <code>null</code> if
+     * not available
+     * @param pos the POS tags
+     * @return <code>true</code> if follow
+     */
+    private boolean followPOS(double[] props,String... pos){
+        boolean reject = false;
+        for(int i=0;i<pos.length;i++){
+            if(props == null || props[i] >= minPosProb){
+                if(followTypes.contains(pos[i])){
+                    return true;
+                } else {
+                    reject = true;
+                }
+            } //else  prob to low ... do not process
+        }
+        //in case we have not found a POS tag with a prob > minPosProb
+        //return TRUE
+        return !reject;
     }
-    private boolean includePOS(String pos){
-        return buildTypes.contains(pos);
+    
+    private boolean includePOS(double[] props,String... pos){
+        boolean reject = false;
+        for(int i=0;i<pos.length;i++){
+            if(props == null || props[i] >= minPosProb){
+                if(buildTypes.contains(pos[i])){
+                    return true;
+                } else { 
+                    reject = true;
+                } 
+            }
+        }
+        //in case we have not found a POS tag with a prob > minPosProb
+        //return TRUE
+        return !reject;
     }
     /**
      * The set of POS types followed to extend Chunks. This includes the
@@ -117,7 +153,7 @@ public class PosTypeChunker {
     }
 
     /**
-     * Build the chunks based on the parsed tokens and tags. <p>
+     * Build the chunks based on the parsed tokens and POS tags. <p>
      * This method is the equivalent to 
      * {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])}
      * @param tokens the tokens
@@ -125,10 +161,48 @@ public class PosTypeChunker {
      * @return the chunks as spans over the parsed tokens
      */
     public Span[] chunkAsSpans(String[] tokens, String[] tags) {
+//      int consumed = -1;
+        List<Span> chunks = new ArrayList<Span>();
+        for(int i=0;i<tokens.length;i++){
+            if(includePOS(null,tags[i])){
+                int start = i;
+                //do not follow backwards!
+//                while(start-1 > consumed && followPOS(tags[start-1])){
+//                    start--; //follow backwards until consumed
+//                }
+                int followEnd = i;
+                int end = i;
+                while(followEnd+1 < tokens.length && followPOS(null,tags[followEnd+1])){
+                    followEnd++; //follow
+                    if(includePOS(null,tags[followEnd])){
+                        end = followEnd; //extend end only if act is include
+                    }
+                }
+                chunks.add(new Span(start,end));
+//                consumed = end;
+                i = followEnd;
+            }//build no chunk for this token
+        }
+        return chunks.toArray(new Span[chunks.size()]);
+    }
+    /**
+     * Build the chunks based on the parsed tokens and the one or more detected
+     * POS tags alternatives for the tokens. <p>
+     * @param tokens the tokens
+     * @param tags the POS tags for the tokens (1D:tokens; 2D:POS tags)
+     * @return the chunks as spans over the parsed tokens
+     */
+    public Span[] chunkAsSpans(String[] tokens, String[][] tags,double[][]props) {
+        //NOTE: this is a 1:1 copy of the above method!! However this is the
+        //      only solution, because merging them into a single one would
+        //      need to copy the Stirng[] of the other into a String[][1] as
+        //      used by this one :(
+        //      If someone has a better Idea feel free to change!
+        //      Rupert Westenthaler (28.Sep.2011)
 //        int consumed = -1;
         List<Span> chunks = new ArrayList<Span>();
         for(int i=0;i<tokens.length;i++){
-            if(includePOS(tags[i])){
+            if(includePOS(props[i],tags[i])){
                 int start = i;
                 //do not follow backwards!
 //                while(start-1 > consumed && followPOS(tags[start-1])){
@@ -136,9 +210,9 @@ public class PosTypeChunker {
 //                }
                 int followEnd = i;
                 int end = i;
-                while(followEnd+1 < tokens.length && followPOS(tags[followEnd+1])){
+                while(followEnd+1 < tokens.length && followPOS(props[followEnd+1],tags[followEnd+1])){
                     followEnd++; //follow
-                    if(includePOS(tags[followEnd])){
+                    if(includePOS(props[followEnd],tags[followEnd])){
                         end = followEnd; //extend end only if act is include
                     }
                 }

Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java Wed Sep 28 11:27:15 2011
@@ -35,6 +35,7 @@ import opennlp.tools.sentdetect.Sentence
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.SimpleTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Sequence;
 import opennlp.tools.util.Span;
 
 import org.apache.felix.scr.annotations.Reference;
@@ -56,6 +57,10 @@ public class TextAnalyzer {
     private boolean enableSentenceDetector = true;
     private boolean enablePosTypeChunker = true;
     private boolean forcePosTypeChunker = true;
+    /**
+     * The minimum POS type probability used by the PosTypeChunker
+     */
+    private double minPosTagProbability = 0.75;
     
     //private POSTaggerME posTagger;
     //private SentenceDetector sentenceDetector;
@@ -67,6 +72,7 @@ public class TextAnalyzer {
      */
     private Map<String,PosTypeChunker> posTypeChunkers = new HashMap<String,PosTypeChunker>();
 
+
     
     public TextAnalyzer(OpenNLP openNLP){
         if(openNLP == null){
@@ -133,7 +139,7 @@ public class TextAnalyzer {
         }
         PosTypeChunker ptc = posTypeChunkers.get(language);
         if(ptc == null){
-            ptc = PosTypeChunker.getInstance(language);
+            ptc = PosTypeChunker.getInstance(language,minPosTagProbability);
             if(ptc != null){
                 posTypeChunkers.put(language, ptc);
             }
@@ -227,6 +233,29 @@ public class TextAnalyzer {
     }
 
     /**
+     * Getter for the minimum POS tag probability so that the
+     * {@link PosTypeChunker} processes a POS tag.
+     * @return the minPosTypeProbability
+     */
+    public final double getMinPosTypeProbability() {
+        return minPosTagProbability;
+    }
+
+    /**
+     * Setter for the minimum POS tag probability so that the
+     * {@link PosTypeChunker} processes a POS tag.
+     * @param probability The probability [0..1] or value &lt; 0 to
+     * deactivate this feature
+     * @throws IllegalArgumentException if values &gt; 1 are parsed as probability
+     */
+    public final void setMinPosTagProbability(double probability) {
+        if(probability > 1){ //only the upper bound is checked: values < 0 deactivate the feature
+            throw new IllegalArgumentException("The minimum POS tag probability MUST be set to a value <= 1 (parsed:"+probability+")");
+        }
+        this.minPosTagProbability = probability;
+    }
+
+    /**
      * Analyses the parsed text in a single chunk. No sentence detector is used
      * @param sentence the sentence (text) to analyse
      * @return the Analysed text
@@ -343,18 +372,48 @@ public class TextAnalyzer {
             for(int ti = 0; ti<tokenSpans.length;ti++) {
                 tokens[ti] = tokenSpans[ti].getCoveredText(sentence).toString();
             }
-            String[] pos;
-            double[] posProbs;
+            String[][] posTags;
+            double[][] posProbs;
             Span[] chunkSpans;
             double[] chunkProps;
             if(tagger != null){
-                pos = tagger.tag(tokens);
-                posProbs = tagger.probs();
+                posTags = new String[tokens.length][];
+                posProbs = new double[tokens.length][];
+                //get the topK POS tags and props and copy it over to the 2dim Arrays
+                Sequence[] posSequences = tagger.topKSequences(tokens);
+                //extract the POS tags and props for the current token from the
+                //posSequences.
+                //NOTE: Sequence includes always POS tags for all Tokens. If
+                //      less then posSequences.length are available it adds the
+                //      best match for all followings.
+                //      We do not want such copies.
+                String[] actPos = new String[posSequences.length];
+                double[] actProp = new double[posSequences.length];
+                for(int i=0;i<tokenSpans.length;i++){
+                    boolean done = false;
+                    int j = 0;
+                    while( j < posSequences.length && !done){
+                        String p = posSequences[j].getOutcomes().get(i);
+                        done = j > 0 && p.equals(actPos[0]);
+                        if(!done){
+                            actPos[j] = p;
+                            actProp[j] = posSequences[j].getProbs()[i];
+                            j++;
+                        }
+                    }
+                    posTags[i] = new String[j];
+                    System.arraycopy(actPos, 0, posTags[i], 0, j);
+                    posProbs[i] = new double[j];
+                    System.arraycopy(actProp, 0, posProbs[i], 0, j);
+                }
+                //posProbs = tagger.probs();
                 if(chunker != null){
+                    //we still need the Array of the best ranked POS tags for the chunker
+                    String[] pos = posSequences[0].getOutcomes().toArray(new String[tokens.length]);
                     chunkSpans = chunker.chunkAsSpans(tokens, pos);
                     chunkProps = chunker.probs();
                 } else if(posTypeChunker != null){
-                    chunkSpans = posTypeChunker.chunkAsSpans(tokens, pos);
+                    chunkSpans = posTypeChunker.chunkAsSpans(tokens, posTags, posProbs);
                     chunkProps = new double[chunkSpans.length];
                     Arrays.fill(chunkProps, 1.0);
                 } else {
@@ -362,7 +421,7 @@ public class TextAnalyzer {
                     chunkProps = null;
                 }
             } else {
-                pos = null;
+                posTags = null;
                 posProbs = null;
                 chunkSpans = null;
                 chunkProps = null;
@@ -370,9 +429,10 @@ public class TextAnalyzer {
             List<Token> tokenList = new ArrayList<Token>(tokenSpans.length);
             for(int i=0;i<tokenSpans.length;i++){
                 tokenList.add(new Token(tokenSpans[i], tokens[i],
-                    pos!=null?pos[i]:null, pos!=null?posProbs[i]:-1));
+                    posTags == null ? null: posTags[i], 
+                            posProbs == null ? null : posProbs[i]));
             }
-            //assign the list to the member var but make itunmodifiable!
+            //assign the list to the member var but make unmodifiable!
             this.tokens = Collections.unmodifiableList(tokenList);
             if(chunkSpans != null){
                 List<Chunk> chunkList = new ArrayList<Chunk>(chunkSpans.length);
@@ -412,14 +472,27 @@ public class TextAnalyzer {
             //NOTE: Members are protected to allow the JVM direct access
             protected final Span span;
             protected String token;
-            protected final String pos;
-            protected final double posProbability;
+            protected final String[] posTags;
+            protected final double[] posProbabilities;
 
             private Token(Span span,String token,String pos,double posProbability){
+                this(span,token,new String[]{pos},new double[] {posProbability});
+            }
+            private Token(Span span,String token,String[] posTags, double[] posProbabilities){
                 this.span = span;
-                this.pos = pos;
+                if(posTags == null || posTags.length < 1){
+                    this.posTags = null;
+                } else {
+                    this.posTags = posTags;
+                }
                 this.token = token;
-                this.posProbability = posProbability;
+                if(this.posTags == null){
+                    this.posProbabilities = null;
+                } else if(posTags.length != posProbabilities.length){
+                    throw new IllegalStateException("POS Tag array and POS probability array MUST BE of the same size!");
+                } else {
+                    this.posProbabilities = posProbabilities;
+                }
             }
 
             public int getStart(){
@@ -428,14 +501,35 @@ public class TextAnalyzer {
             public int getEnd(){
                 return span.getEnd();
             }
+            /**
+             * Getter for the best ranked POS tag for this token
+             * @return
+             */
             public String getPosTag(){
-                return pos;
+                return posTags == null ? null : posTags[0];
             }
             /**
+             * Getter for all the POS tags of this Token. The one with the
+             * highest probability is at index 0.
+             * @return All POS tags assigned to this Token
+             */
+            public String[] getPosTags(){
+                return posTags;
+            }
+            /**
+             * Getter for the probability of the top ranked POS tag
              * @return the POS probability
              */
             public double getPosProbability() {
-                return posProbability;
+                return posProbabilities == null ? -1 : posProbabilities[0];
+            }
+            /**
+             * Getter for the probabilities of all {@link #getPosTags() POS tags}
+             * @return the probabilities of the POS tags returned by
+             * {@link #getPosTags()}
+             */
+            public double[] getPosProbabilities(){
+                return posProbabilities;
             }
             /**
              * Getter for the value of this token
@@ -449,7 +543,10 @@ public class TextAnalyzer {
             }
             @Override
             public String toString() {
-                return getText()+(pos != null?'_'+pos:"");
+                return getText()+(posTags != null?
+                        '_'+(posTags.length == 1 ?
+                                posTags[0] :
+                                    Arrays.toString(posTags)):"");
             }
         }
         public class Chunk {

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Wed Sep 28 11:27:15 2011
@@ -119,6 +119,7 @@ public class KeywordLinkingEngine implem
     public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
     public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
     public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
+    public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
 //  public static final String SIMPLE_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
 //  public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
     /**
@@ -126,6 +127,7 @@ public class KeywordLinkingEngine implem
      * language are processed. 
      */
     public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
+    public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.75;
     /**
      * The languages this engine is configured to enhance. An empty List is
      * considered as active for any language
@@ -444,7 +446,7 @@ public class KeywordLinkingEngine implem
      * call<ul>
      * <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)}
      * <li> {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)} and
-     * <li> {@link #activateProcessedLanguages(Dictionary)}
+     * <li> {@link #activateTextAnalyzer(Dictionary)}
      * </ul>
      * if applicable.
      * @param context the Component context
@@ -454,21 +456,26 @@ public class KeywordLinkingEngine implem
     @Activate
     @SuppressWarnings("unchecked")
     protected void activate(ComponentContext context) throws ConfigurationException {
-        textAnalyser = new TextAnalyzer(openNLP);
-        analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(textAnalyser);
         Dictionary<String,Object> properties = context.getProperties();
+        activateTextAnalyzer(properties);
         activateEntitySearcher(context, properties);
         activateEntityLinkerConfig(properties);
-        activateProcessedLanguages(properties);
     }
 
     /**
-     * Initialise the processed languages based on the value of the
-     * {@link #PROCESSED_LANGUAGES} key. If no configuration is present the
+     * Initialise the {@link TextAnalyzer} component.<p>
+     * Currently this includes the following configurations: <ul>
+     * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the
      * default (process all languages) is used.
+     * <li> {@value #MIN_POS_TAG_PROBABILITY}: If no configuration is
+     * present the #DEFAULT_MIN_POS_TAG_PROBABILITY is used
+     * languages based on the value of the
+     * 
      * @param configuration the OSGI component configuration
      */
-    protected final void activateProcessedLanguages(Dictionary<String,Object> configuration) {
+    protected final void activateTextAnalyzer(Dictionary<String,Object> configuration) throws ConfigurationException {
+        textAnalyser = new TextAnalyzer(openNLP);
+        analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(textAnalyser);
         Object value;
         value = configuration.get(PROCESSED_LANGUAGES);
         if(value == null){
@@ -487,6 +494,26 @@ public class KeywordLinkingEngine implem
                 }
             }
         }
+        value = configuration.get(MIN_POS_TAG_PROBABILITY);
+        double minPosTagProb;
+        if(value instanceof Number){
+            minPosTagProb = ((Number)value).doubleValue();
+        } else if(value != null && !value.toString().isEmpty()){
+            try {
+                minPosTagProb = Double.valueOf(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, 
+                    "Unable to parse the min POS tag probability from the parsed value "+value,e);
+            }
+        } else {
+            minPosTagProb = DEFAULT_MIN_POS_TAG_PROBABILITY;
+        }
+        if(minPosTagProb > 1){
+            throw new ConfigurationException(MIN_POS_TAG_PROBABILITY, 
+                "The configured min POS tag probability MUST BE in the range [0..1] " +
+                "or < 0 to deactivate this feature (parsed value "+value+")!");
+        }
+        textAnalyser.setMinPosTagProbability(minPosTagProb);
     }
 
     /**
@@ -659,14 +686,17 @@ public class KeywordLinkingEngine implem
     @Deactivate
     protected void deactivate(ComponentContext context) {
         deactivateEntitySearcher();
-        deactivateProcessedLanguages();
+        deactivateTextAnalyzer();
         deactivateEntityLinkerConfig();
     }
 
     /**
-     * Sets the languages to {@link #DEFAULT_LANGUAGES}
+     * Deactivates the {@link TextAnalyzer} as well as resets the set of languages
+     * to process to {@link #DEFAULT_LANGUAGES}
      */
-    protected void deactivateProcessedLanguages() {
+    protected void deactivateTextAnalyzer() {
+        this.textAnalyser = null;
+        this.analysedContentFactory = null;
         languages = DEFAULT_LANGUAGES;
     }
 

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java Wed Sep 28 11:27:15 2011
@@ -39,21 +39,25 @@ public interface AnalysedContent {
      * Called to check if a {@link Token} should be used to search for
      * Concepts within the Taxonomy based on the POS tag of the Token.
      * @param posTag the POS tag to check
+     * @param posProb the probability of the POS tag or <code>1.0</code> if not
+     * available
      * @return <code>true</code> if Tokens with this POS tag should be
      * included in searches. Otherwise <code>false</code>.  If this information 
      * is not available (e.g. no set of Tags that need to be processed is defined) 
      * this Method MUST return <code>null</code>
      */
-    public Boolean processPOS(String posTag);
+    public Boolean processPOS(String posTag, double posProb);
     /**
      * Called to check if a chunk should be used to search for Concepts.
      * @param chunkTag the tag (type) of the chunk
+     * @param chunkProb the probability of the chunk tag or <code>1.0</code> if
+     * not available
      * @return <code>true</code> if chunks with this tag (type) should be
      * processed (used to search for matches of concepts) and <code>false</code>
      * if not. If this information is not available (e.g. no set of Tags that
      * need to be processed is defined) this Method MUST return <code>null</code>
      */
-    public Boolean processChunk(String chunkTag);
+    public Boolean processChunk(String chunkTag,double chunkProb);
     /**
      * Tokenizes the parsed label
      * @param label the label to tokenize

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Wed Sep 28 11:27:15 2011
@@ -379,8 +379,14 @@ public class EntityLinker {
      */
     private boolean isProcessableToken(Token token) {
         Boolean processToken = null;
-        if(token.getPosTag() != null){
-            processToken = content.processPOS(token.getPosTag());
+        String[] posTags = token.getPosTags();
+        double[] posProb = token.getPosProbabilities();
+        if(posTags != null){
+            int i=0;
+            do {
+                processToken = content.processPOS(posTags[i],posProb[i]);
+                i++;
+            } while(processToken != null && processToken.equals(Boolean.FALSE) && i<posTags.length);
         }
         if(processToken == null) {
              processToken = token.getText().length() >= config.getMinSearchTokenLength();

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java?rev=1176827&r1=1176826&r2=1176827&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java Wed Sep 28 11:27:15 2011
@@ -89,6 +89,7 @@ public class OpenNlpAnalysedContentFacto
      *
      */
     private class OpenNlpAnalysedContent implements AnalysedContent{
+        private final double minPosTagProbability;
         private final String language;
         private final Iterator<AnalysedText> sentences;
         private final Set<String> posTags;
@@ -99,6 +100,7 @@ public class OpenNlpAnalysedContentFacto
             this.sentences = textAnalyzer.analyse(text, lang);
             this.posTags = PosTagsCollectionEnum.getPosTagCollection(lang, PosTypeCollectionType.NOUN);
             this.tokenizer = textAnalyzer.getTokenizer(lang);
+            minPosTagProbability = textAnalyzer.getMinPosTypeProbability();
         }
         
         /**
@@ -113,22 +115,25 @@ public class OpenNlpAnalysedContentFacto
          * Called to check if a {@link Token} should be used to search for
          * Concepts within the Taxonomy based on the POS tag of the Token.
          * @param posTag the POS tag to check
+         * @param posProb the probability of the parsed POS tag
          * @return <code>true</code> if Tokens with this POS tag should be
          * included in searches. Otherwise <code>false</code>. Also returns
          * <code>true</code> if no POS type configuration is available for the
          * language parsed in the constructor
          */
         @Override
-        public Boolean processPOS(String posTag) {
-            return posTags != null ? Boolean.valueOf(posTags.contains(posTag)) : null;
+        public Boolean processPOS(String posTag, double posProb) {
+            return posTags != null && posProb > minPosTagProbability ? 
+                    Boolean.valueOf(posTags.contains(posTag)) : null;
         }
         /**
          * Not yet implemented.
          * @param chunkTag the type of the chunk
+         * @param chunkProb the probability of the parsed chunk tag
          * @return returns always <code>true</code>
          */
         @Override
-        public Boolean processChunk(String chunkTag) {
+        public Boolean processChunk(String chunkTag, double chunkProb) {
             // TODO implement
             return null;
         }