You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/10/29 11:16:15 UTC

svn commit: r1403228 - in /stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos: model/PosTagSetRegistry.java services/POSTaggingEngine.java

Author: rwesten
Date: Mon Oct 29 10:16:14 2012
New Revision: 1403228

URL: http://svn.apache.org/viewvc?rev=1403228&view=rev
Log:
STANBOL-734, STANBOL-735: Added Pos enum mappings to the OpenNLP PosTagSet's

Modified:
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java?rev=1403228&r1=1403227&r2=1403228&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java Mon Oct 29 10:16:14 2012
@@ -6,6 +6,7 @@ import java.util.Map;
 import org.apache.stanbol.commons.opennlp.PosTagsCollectionEnum;
 import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 import org.apache.stanbol.enhancer.nlp.pos.olia.English;
 import org.apache.stanbol.enhancer.nlp.pos.olia.German;
@@ -94,40 +95,39 @@ public final class PosTagSetRegistry {
     
     static {
         DANISH.addTag(new PosTag("N",LexicalCategory.Noun));
-        DANISH.addTag(new PosTag("NP",LexicalCategory.Noun));
-        DANISH.addTag(new PosTag("NC",LexicalCategory.Noun));
-        DANISH.addTag(new PosTag("AC",LexicalCategory.Quantifier)); //numbers
-        DANISH.addTag(new PosTag("AO",LexicalCategory.Quantifier)); //numbers
-        DANISH.addTag(new PosTag("XX",LexicalCategory.Noun)); //unsure
-        DANISH.addTag(new PosTag("XF",LexicalCategory.Noun)); //foreign word
-        DANISH.addTag(new PosTag("XR",LexicalCategory.Quantifier)); //number letters
-        DANISH.addTag(new PosTag("XR",LexicalCategory.Quantifier)); //symbol letters
+        DANISH.addTag(new PosTag("NP",Pos.ProperNoun));
+        DANISH.addTag(new PosTag("NC",Pos.CommonNoun));
+        DANISH.addTag(new PosTag("AC",Pos.CardinalNumber)); //numbers
+        DANISH.addTag(new PosTag("AO",Pos.OrdinalNumber)); //numbers
+        DANISH.addTag(new PosTag("AN",LexicalCategory.Adjective));
+        DANISH.addTag(new PosTag("XX",Pos.Typo)); //non-words (incl typos ..)
+        DANISH.addTag(new PosTag("XF",Pos.Foreign)); //foreign word
+        DANISH.addTag(new PosTag("XR",Pos.Symbol)); //symbol letters
         DANISH.addTag(new PosTag("XA",LexicalCategory.Noun)); //abbreviations
-        DANISH.addTag(new PosTag("XX",LexicalCategory.Quantifier)); //tokenizer errors
+        DANISH.addTag(new PosTag("XS",Pos.Abbreviation)); //abbreviations
         DANISH.addTag(new PosTag("V",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VA",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VAD",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VAF",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VAG",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VAPR",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VAPA",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VE",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VED",LexicalCategory.Verb)); 
-        DANISH.addTag(new PosTag("VEF",LexicalCategory.Verb)); 
+        DANISH.addTag(new PosTag("VA", Pos.MainVerb)); 
+        DANISH.addTag(new PosTag("VAD", Pos.MainVerb, Pos.IndicativeVerb)); 
+        DANISH.addTag(new PosTag("VAF", Pos.MainVerb, Pos.Infinitive)); 
+        DANISH.addTag(new PosTag("VAG", Pos.MainVerb, Pos.Gerund)); 
+        DANISH.addTag(new PosTag("VAPR", Pos.MainVerb, Pos.PresentParticiple)); 
+        DANISH.addTag(new PosTag("VAPA", Pos.MainVerb, Pos.PastParticiple)); 
+        DANISH.addTag(new PosTag("VE", LexicalCategory.Verb)); //TODO MedialVerb is missing 
+        DANISH.addTag(new PosTag("VED",Pos.IndicativeVerb)); //TODO MedialVerb is missing 
+        DANISH.addTag(new PosTag("VEF",Pos.Infinitive)); //TODO MedialVerb is missing 
         DANISH.addTag(new PosTag("XP",LexicalCategory.Punctuation)); 
-        DANISH.addTag(new PosTag("CC",LexicalCategory.Conjuction)); 
-        DANISH.addTag(new PosTag("CS",LexicalCategory.Conjuction)); 
-        DANISH.addTag(new PosTag("U",LexicalCategory.Noun)); //unknown tokens
-        DANISH.addTag(new PosTag("SP",LexicalCategory.Adposition)); 
-        DANISH.addTag(new PosTag("AN",LexicalCategory.Adjective)); //unsure
-        DANISH.addTag(new PosTag("R",LexicalCategory.Adverb)); //unsure
-        DANISH.addTag(new PosTag("RG",LexicalCategory.Adverb)); //unsure
-        DANISH.addTag(new PosTag("PD",LexicalCategory.PronounOrDeterminer)); //unsure
-        DANISH.addTag(new PosTag("PI",LexicalCategory.PronounOrDeterminer)); //unsure
-        DANISH.addTag(new PosTag("PT",LexicalCategory.PronounOrDeterminer)); //unsure
-        DANISH.addTag(new PosTag("PP",LexicalCategory.PronounOrDeterminer)); //unsure
-        DANISH.addTag(new PosTag("PO",LexicalCategory.PronounOrDeterminer)); //unsure
-        DANISH.addTag(new PosTag("PC",LexicalCategory.PronounOrDeterminer)); //unsure
+        DANISH.addTag(new PosTag("CC",Pos.CoordinatingConjunction)); 
+        DANISH.addTag(new PosTag("SC",Pos.SubordinatingConjunction)); 
+        DANISH.addTag(new PosTag("U")); //unmarked for degree
+        DANISH.addTag(new PosTag("SP",Pos.Preposition)); 
+        DANISH.addTag(new PosTag("R",Pos.AdjectivalAdverb));
+        DANISH.addTag(new PosTag("RG",LexicalCategory.Adverb));
+        DANISH.addTag(new PosTag("PD",Pos.DemonstrativePronoun)); 
+        DANISH.addTag(new PosTag("PI",Pos.IndefinitePronoun)); 
+        DANISH.addTag(new PosTag("PT",Pos.InterrogativePronoun,Pos.RelativePronoun));
+        DANISH.addTag(new PosTag("PP",Pos.PersonalPronoun)); //unsure
+        DANISH.addTag(new PosTag("PO",Pos.PossessivePronoun)); //unsure
+        DANISH.addTag(new PosTag("PC",Pos.ReciprocalPronoun)); //unsure
         DANISH.addTag(new PosTag("U=",LexicalCategory.Unique)); //unsure
         DANISH.addTag(new PosTag("I=",LexicalCategory.Interjection)); //unsure
         getInstance().add(DANISH);
@@ -141,22 +141,24 @@ public final class PosTagSetRegistry {
     public static final TagSet<PosTag> PORTUGUESE = new TagSet<PosTag>("PALAVRAS Portuguese","pt");
     
     static {
-        PORTUGUESE.addTag(new PosTag("n",LexicalCategory.Noun));
-        PORTUGUESE.addTag(new PosTag("prop",LexicalCategory.Noun));
-        PORTUGUESE.addTag(new PosTag("v-fin",LexicalCategory.Verb));
-        PORTUGUESE.addTag(new PosTag("v-inf",LexicalCategory.Verb));
-        PORTUGUESE.addTag(new PosTag("v-pcp",LexicalCategory.Verb));
-        PORTUGUESE.addTag(new PosTag("v-ger",LexicalCategory.Verb));
-        PORTUGUESE.addTag(new PosTag("art",LexicalCategory.PronounOrDeterminer));
-        PORTUGUESE.addTag(new PosTag("pron-pers",LexicalCategory.PronounOrDeterminer));
-        PORTUGUESE.addTag(new PosTag("pron-det",LexicalCategory.PronounOrDeterminer));
-        PORTUGUESE.addTag(new PosTag("pron-indp",LexicalCategory.PronounOrDeterminer));
+        PORTUGUESE.addTag(new PosTag("n",Pos.CommonNoun));
+        PORTUGUESE.addTag(new PosTag("prop",Pos.ProperNoun));
+        PORTUGUESE.addTag(new PosTag("adj",LexicalCategory.Adjective));
+        PORTUGUESE.addTag(new PosTag("v-fin",Pos.FiniteVerb));
+        PORTUGUESE.addTag(new PosTag("v-inf",Pos.Infinitive));
+        PORTUGUESE.addTag(new PosTag("v-pcp",Pos.Participle));
+        PORTUGUESE.addTag(new PosTag("v-ger",Pos.Gerund));
+        PORTUGUESE.addTag(new PosTag("art",Pos.Article));
+        PORTUGUESE.addTag(new PosTag("pron",Pos.Pronoun));
+        PORTUGUESE.addTag(new PosTag("pron-pers",Pos.PersonalPronoun));
+        PORTUGUESE.addTag(new PosTag("pron-det",Pos.DeterminalPronoun));
+        PORTUGUESE.addTag(new PosTag("pron-indp",Pos.Pronoun)); //TODO: missing independent pronoun 
         PORTUGUESE.addTag(new PosTag("adv",LexicalCategory.Adverb));
-        PORTUGUESE.addTag(new PosTag("num",LexicalCategory.Quantifier));
-        PORTUGUESE.addTag(new PosTag("prp",LexicalCategory.Adposition));
+        PORTUGUESE.addTag(new PosTag("num",Pos.Numeral));
+        PORTUGUESE.addTag(new PosTag("prp",Pos.Preposition));
         PORTUGUESE.addTag(new PosTag("in",LexicalCategory.Interjection));
-        PORTUGUESE.addTag(new PosTag("conj-s",LexicalCategory.Conjuction));
-        PORTUGUESE.addTag(new PosTag("conj-c",LexicalCategory.Conjuction));
+        PORTUGUESE.addTag(new PosTag("conj-s",Pos.SubordinatingConjunction));
+        PORTUGUESE.addTag(new PosTag("conj-c",Pos.CoordinatingConjunction));
         PORTUGUESE.addTag(new PosTag("punc",LexicalCategory.Punctuation)); //missing on the webpage ^
         getInstance().add(PORTUGUESE);
     }
@@ -166,21 +168,22 @@ public final class PosTagSetRegistry {
      * voor het Nederlands", doctoral dissertation, Department of language & 
      * Speech, Nijmegen University (renamed to Radboud University), 
      * december 1994.<p>
-     * 
+     * <b>NOTE:</b> This {@link TagSet} DOES NOT distinguish between Proper- and
+     * Common- Nouns!<p>
      */
     public static final TagSet<PosTag> DUTCH = new TagSet<PosTag>("WOTAN Dutch","nl");
     
     static {
         DUTCH.addTag(new PosTag("Adj",LexicalCategory.Adjective));
         DUTCH.addTag(new PosTag("Adv",LexicalCategory.Adverb));
-        DUTCH.addTag(new PosTag("Art",LexicalCategory.PronounOrDeterminer));
+        DUTCH.addTag(new PosTag("Art",Pos.Article));
         DUTCH.addTag(new PosTag("Conj",LexicalCategory.Conjuction));
         DUTCH.addTag(new PosTag("Int",LexicalCategory.Interjection));
         DUTCH.addTag(new PosTag("N",LexicalCategory.Noun));
-        DUTCH.addTag(new PosTag("Num",LexicalCategory.Quantifier));
-        DUTCH.addTag(new PosTag("Misc",null));
-        DUTCH.addTag(new PosTag("Prep",LexicalCategory.Adposition));
-        DUTCH.addTag(new PosTag("Pron",LexicalCategory.PronounOrDeterminer));
+        DUTCH.addTag(new PosTag("Num",Pos.Numeral));
+        DUTCH.addTag(new PosTag("Misc"));
+        DUTCH.addTag(new PosTag("Prep",Pos.Preposition));
+        DUTCH.addTag(new PosTag("Pron",Pos.Pronoun));
         DUTCH.addTag(new PosTag("Punc",LexicalCategory.Punctuation));
         DUTCH.addTag(new PosTag("V",LexicalCategory.Verb));
         getInstance().add(DUTCH);
@@ -189,51 +192,53 @@ public final class PosTagSetRegistry {
      * POS tags used by the Swedish POS model of OpenNLP for Swedish based on the
      * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
      * Lexical categories in MAMBA</a>
+     * Most of the <i>'interesting'</i> {@link Pos} mappings would be defined
+     * as "Features" of MAMBA.
      */
     public static final TagSet<PosTag> SWEDISH = new TagSet<PosTag>("MAMBA Swedish","sv");
     
     static {
-        SWEDISH.addTag(new PosTag("PN",LexicalCategory.Noun));
-        SWEDISH.addTag(new PosTag("MN",LexicalCategory.Noun));
-        SWEDISH.addTag(new PosTag("AN",LexicalCategory.Noun));
-        SWEDISH.addTag(new PosTag("VN",LexicalCategory.Noun));
-        SWEDISH.addTag(new PosTag("NN",LexicalCategory.Noun));
-        SWEDISH.addTag(new PosTag("PO",LexicalCategory.PronounOrDeterminer));
-        SWEDISH.addTag(new PosTag("EN",LexicalCategory.Quantifier));
-        SWEDISH.addTag(new PosTag("RO",LexicalCategory.Quantifier));
+        SWEDISH.addTag(new PosTag("PN",Pos.ProperNoun));
+        SWEDISH.addTag(new PosTag("MN",Pos.CommonNoun)); //TODO: missing Meta-Nouns
+        SWEDISH.addTag(new PosTag("AN",Pos.CommonNoun)); //TODO: missing Adjectival noun
+        SWEDISH.addTag(new PosTag("VN",Pos.VerbalNoun));
+        SWEDISH.addTag(new PosTag("NN",Pos.CommonNoun));
+        SWEDISH.addTag(new PosTag("PO",Pos.Pronoun));
+        SWEDISH.addTag(new PosTag("EN",Pos.IndefiniteArticle, Pos.Numeral));
+        SWEDISH.addTag(new PosTag("RO",Pos.Numeral));
         SWEDISH.addTag(new PosTag("AJ",LexicalCategory.Adjective));
-        SWEDISH.addTag(new PosTag("AV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("BV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("HV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("WV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("QV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("KV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("SV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("VV",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("TP",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("SP",LexicalCategory.Verb));
+        SWEDISH.addTag(new PosTag("AV",LexicalCategory.Verb)); //"vara" (be)
+        SWEDISH.addTag(new PosTag("BV",LexicalCategory.Verb)); //"bli(va)" (become)
+        SWEDISH.addTag(new PosTag("HV",LexicalCategory.Verb)); //"ha(va)" (have)
+        SWEDISH.addTag(new PosTag("WV",LexicalCategory.Verb)); //"vilja" (want)
+        SWEDISH.addTag(new PosTag("QV",LexicalCategory.Verb)); //"kunna" (can)
+        SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb)); //"måste" (must)
+        SWEDISH.addTag(new PosTag("KV",LexicalCategory.Verb)); // locution "komma att" (periphrastic future)
+        SWEDISH.addTag(new PosTag("SV",LexicalCategory.Verb)); //"skola" (will, shall)
+        SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb)); //"göra" (do, make)
+        SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb)); //"få" (get)
+        SWEDISH.addTag(new PosTag("VV",LexicalCategory.Verb)); //all other verbs
+        SWEDISH.addTag(new PosTag("TP",Pos.PastParticiple)); //PerfectParticiple
+        SWEDISH.addTag(new PosTag("SP",Pos.PresentParticiple));
         SWEDISH.addTag(new PosTag("AB",LexicalCategory.Adverb));
-        SWEDISH.addTag(new PosTag("PR",LexicalCategory.Adposition));
-        SWEDISH.addTag(new PosTag("IM",LexicalCategory.Verb));
-        SWEDISH.addTag(new PosTag("++",LexicalCategory.Conjuction));
-        SWEDISH.addTag(new PosTag("UK",LexicalCategory.Conjuction));
-        SWEDISH.addTag(new PosTag("IK",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("IP",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("I?",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("IU",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("IQ",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("IS",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("IT",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("IR",LexicalCategory.Punctuation));
+        SWEDISH.addTag(new PosTag("PR",Pos.Preposition));
+        SWEDISH.addTag(new PosTag("IM",Pos.Infinitive));
+        SWEDISH.addTag(new PosTag("++",Pos.CoordinatingConjunction));
+        SWEDISH.addTag(new PosTag("UK",Pos.SubordinatingConjunction));
+        SWEDISH.addTag(new PosTag("IK",Pos.Comma));
+        SWEDISH.addTag(new PosTag("IP",Pos.Point));
+        SWEDISH.addTag(new PosTag("I?",Pos.QuestionMark));
+        SWEDISH.addTag(new PosTag("IU",Pos.ExclamativePoint));
+        SWEDISH.addTag(new PosTag("IQ",Pos.Colon));
+        SWEDISH.addTag(new PosTag("IS",Pos.SemiColon));
+        SWEDISH.addTag(new PosTag("IT",Pos.Hyphen));
+        SWEDISH.addTag(new PosTag("IR",Pos.ParentheticalPunctuation));
         SWEDISH.addTag(new PosTag("IC",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("PU",LexicalCategory.Punctuation));
+        SWEDISH.addTag(new PosTag("PU",Pos.ListMarker));
         SWEDISH.addTag(new PosTag("IG",LexicalCategory.Punctuation));
-        SWEDISH.addTag(new PosTag("YY",LexicalCategory.Conjuction));
+        SWEDISH.addTag(new PosTag("YY",Pos.Interjection));
         SWEDISH.addTag(new PosTag("ID",LexicalCategory.Noun));
-        SWEDISH.addTag(new PosTag("XX",null));
+        SWEDISH.addTag(new PosTag("XX"));
         getInstance().add(SWEDISH);
     }
 

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java?rev=1403228&r1=1403227&r2=1403228&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java Mon Oct 29 10:16:14 2012
@@ -255,17 +255,18 @@ public class POSTaggingEngine extends Ab
             posTag(tokenList, posTagger,tagSet,adhocTags);
             
         }
-        
-        logAnnotations(at);
+        if(log.isTraceEnabled()){
+            logAnnotations(at);
+        }
     }
     
     private void logAnnotations(AnalysedText at){
         Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
         while(it.hasNext()){
             Span span = it.next();
-            log.info(" > {}",span);
+            log.trace(" > {}",span);
             for(Value<PosTag> value : span.getAnnotations(POS_ANNOTATION)){
-                log.info("   - {}",value);
+                log.trace("   - {}",value);
             }
         }
     }