You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/10/29 11:16:15 UTC
svn commit: r1403228 - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos:
model/PosTagSetRegistry.java services/POSTaggingEngine.java
Author: rwesten
Date: Mon Oct 29 10:16:14 2012
New Revision: 1403228
URL: http://svn.apache.org/viewvc?rev=1403228&view=rev
Log:
STANBOL-734, STANBOL-735: Added Pos enum mappings to the OpenNLP PosTagSet's
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java?rev=1403228&r1=1403227&r2=1403228&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java Mon Oct 29 10:16:14 2012
@@ -6,6 +6,7 @@ import java.util.Map;
import org.apache.stanbol.commons.opennlp.PosTagsCollectionEnum;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.pos.olia.English;
import org.apache.stanbol.enhancer.nlp.pos.olia.German;
@@ -94,40 +95,39 @@ public final class PosTagSetRegistry {
static {
DANISH.addTag(new PosTag("N",LexicalCategory.Noun));
- DANISH.addTag(new PosTag("NP",LexicalCategory.Noun));
- DANISH.addTag(new PosTag("NC",LexicalCategory.Noun));
- DANISH.addTag(new PosTag("AC",LexicalCategory.Quantifier)); //numbers
- DANISH.addTag(new PosTag("AO",LexicalCategory.Quantifier)); //numbers
- DANISH.addTag(new PosTag("XX",LexicalCategory.Noun)); //unsure
- DANISH.addTag(new PosTag("XF",LexicalCategory.Noun)); //foreign word
- DANISH.addTag(new PosTag("XR",LexicalCategory.Quantifier)); //number letters
- DANISH.addTag(new PosTag("XR",LexicalCategory.Quantifier)); //symbol letters
+ DANISH.addTag(new PosTag("NP",Pos.ProperNoun));
+ DANISH.addTag(new PosTag("NC",Pos.CommonNoun));
+ DANISH.addTag(new PosTag("AC",Pos.CardinalNumber)); //numbers
+ DANISH.addTag(new PosTag("AO",Pos.OrdinalNumber)); //numbers
+ DANISH.addTag(new PosTag("AN",LexicalCategory.Adjective));
+ DANISH.addTag(new PosTag("XX",Pos.Typo)); //non-words (incl typos ..)
+ DANISH.addTag(new PosTag("XF",Pos.Foreign)); //foreign word
+ DANISH.addTag(new PosTag("XR",Pos.Symbol)); //symbol letters
DANISH.addTag(new PosTag("XA",LexicalCategory.Noun)); //abbreviations
- DANISH.addTag(new PosTag("XX",LexicalCategory.Quantifier)); //tokenizer errors
+ DANISH.addTag(new PosTag("XS",Pos.Abbreviation)); //abbreviations
DANISH.addTag(new PosTag("V",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VA",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VAD",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VAF",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VAG",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VAPR",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VAPA",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VE",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VED",LexicalCategory.Verb));
- DANISH.addTag(new PosTag("VEF",LexicalCategory.Verb));
+ DANISH.addTag(new PosTag("VA", Pos.MainVerb));
+ DANISH.addTag(new PosTag("VAD", Pos.MainVerb, Pos.IndicativeVerb));
+ DANISH.addTag(new PosTag("VAF", Pos.MainVerb, Pos.Infinitive));
+ DANISH.addTag(new PosTag("VAG", Pos.MainVerb, Pos.Gerund));
+ DANISH.addTag(new PosTag("VAPR", Pos.MainVerb, Pos.PresentParticiple));
+ DANISH.addTag(new PosTag("VAPA", Pos.MainVerb, Pos.PastParticiple));
+ DANISH.addTag(new PosTag("VE", LexicalCategory.Verb)); //TODO MedialVerb is missing
+ DANISH.addTag(new PosTag("VED",Pos.IndicativeVerb)); //TODO MedialVerb is missing
+ DANISH.addTag(new PosTag("VEF",Pos.Infinitive)); //TODO MedialVerb is missing
DANISH.addTag(new PosTag("XP",LexicalCategory.Punctuation));
- DANISH.addTag(new PosTag("CC",LexicalCategory.Conjuction));
- DANISH.addTag(new PosTag("CS",LexicalCategory.Conjuction));
- DANISH.addTag(new PosTag("U",LexicalCategory.Noun)); //unknown tokens
- DANISH.addTag(new PosTag("SP",LexicalCategory.Adposition));
- DANISH.addTag(new PosTag("AN",LexicalCategory.Adjective)); //unsure
- DANISH.addTag(new PosTag("R",LexicalCategory.Adverb)); //unsure
- DANISH.addTag(new PosTag("RG",LexicalCategory.Adverb)); //unsure
- DANISH.addTag(new PosTag("PD",LexicalCategory.PronounOrDeterminer)); //unsure
- DANISH.addTag(new PosTag("PI",LexicalCategory.PronounOrDeterminer)); //unsure
- DANISH.addTag(new PosTag("PT",LexicalCategory.PronounOrDeterminer)); //unsure
- DANISH.addTag(new PosTag("PP",LexicalCategory.PronounOrDeterminer)); //unsure
- DANISH.addTag(new PosTag("PO",LexicalCategory.PronounOrDeterminer)); //unsure
- DANISH.addTag(new PosTag("PC",LexicalCategory.PronounOrDeterminer)); //unsure
+ DANISH.addTag(new PosTag("CC",Pos.CoordinatingConjunction));
+ DANISH.addTag(new PosTag("SC",Pos.SubordinatingConjunction));
+ DANISH.addTag(new PosTag("U")); //unmarked for degree
+ DANISH.addTag(new PosTag("SP",Pos.Preposition));
+ DANISH.addTag(new PosTag("R",Pos.AdjectivalAdverb));
+ DANISH.addTag(new PosTag("RG",LexicalCategory.Adverb));
+ DANISH.addTag(new PosTag("PD",Pos.DemonstrativePronoun));
+ DANISH.addTag(new PosTag("PI",Pos.IndefinitePronoun));
+ DANISH.addTag(new PosTag("PT",Pos.InterrogativePronoun,Pos.RelativePronoun));
+ DANISH.addTag(new PosTag("PP",Pos.PersonalPronoun)); //unsure
+ DANISH.addTag(new PosTag("PO",Pos.PossessivePronoun)); //unsure
+ DANISH.addTag(new PosTag("PC",Pos.ReciprocalPronoun)); //unsure
DANISH.addTag(new PosTag("U=",LexicalCategory.Unique)); //unsure
DANISH.addTag(new PosTag("I=",LexicalCategory.Interjection)); //unsure
getInstance().add(DANISH);
@@ -141,22 +141,24 @@ public final class PosTagSetRegistry {
public static final TagSet<PosTag> PORTUGUESE = new TagSet<PosTag>("PALAVRAS Portuguese","pt");
static {
- PORTUGUESE.addTag(new PosTag("n",LexicalCategory.Noun));
- PORTUGUESE.addTag(new PosTag("prop",LexicalCategory.Noun));
- PORTUGUESE.addTag(new PosTag("v-fin",LexicalCategory.Verb));
- PORTUGUESE.addTag(new PosTag("v-inf",LexicalCategory.Verb));
- PORTUGUESE.addTag(new PosTag("v-pcp",LexicalCategory.Verb));
- PORTUGUESE.addTag(new PosTag("v-ger",LexicalCategory.Verb));
- PORTUGUESE.addTag(new PosTag("art",LexicalCategory.PronounOrDeterminer));
- PORTUGUESE.addTag(new PosTag("pron-pers",LexicalCategory.PronounOrDeterminer));
- PORTUGUESE.addTag(new PosTag("pron-det",LexicalCategory.PronounOrDeterminer));
- PORTUGUESE.addTag(new PosTag("pron-indp",LexicalCategory.PronounOrDeterminer));
+ PORTUGUESE.addTag(new PosTag("n",Pos.CommonNoun));
+ PORTUGUESE.addTag(new PosTag("prop",Pos.ProperNoun));
+ PORTUGUESE.addTag(new PosTag("adj",LexicalCategory.Adjective));
+ PORTUGUESE.addTag(new PosTag("v-fin",Pos.FiniteVerb));
+ PORTUGUESE.addTag(new PosTag("v-inf",Pos.Infinitive));
+ PORTUGUESE.addTag(new PosTag("v-pcp",Pos.Participle));
+ PORTUGUESE.addTag(new PosTag("v-ger",Pos.Gerund));
+ PORTUGUESE.addTag(new PosTag("art",Pos.Article));
+ PORTUGUESE.addTag(new PosTag("pron",Pos.Pronoun));
+ PORTUGUESE.addTag(new PosTag("pron-pers",Pos.PersonalPronoun));
+ PORTUGUESE.addTag(new PosTag("pron-det",Pos.DeterminalPronoun));
+ PORTUGUESE.addTag(new PosTag("pron-indp",Pos.Pronoun)); //TODO: missing independent pronoun
PORTUGUESE.addTag(new PosTag("adv",LexicalCategory.Adverb));
- PORTUGUESE.addTag(new PosTag("num",LexicalCategory.Quantifier));
- PORTUGUESE.addTag(new PosTag("prp",LexicalCategory.Adposition));
+ PORTUGUESE.addTag(new PosTag("num",Pos.Numeral));
+ PORTUGUESE.addTag(new PosTag("prp",Pos.Preposition));
PORTUGUESE.addTag(new PosTag("in",LexicalCategory.Interjection));
- PORTUGUESE.addTag(new PosTag("conj-s",LexicalCategory.Conjuction));
- PORTUGUESE.addTag(new PosTag("conj-c",LexicalCategory.Conjuction));
+ PORTUGUESE.addTag(new PosTag("conj-s",Pos.SubordinatingConjunction));
+ PORTUGUESE.addTag(new PosTag("conj-c",Pos.CoordinatingConjunction));
PORTUGUESE.addTag(new PosTag("punc",LexicalCategory.Punctuation)); //missing on the webpage ^
getInstance().add(PORTUGUESE);
}
@@ -166,21 +168,22 @@ public final class PosTagSetRegistry {
* voor het Nederlands", doctoral dissertation, Department of language &
* Speech, Nijmegen University (renamed to Radboud University),
* december 1994.<p>
- *
+ * <b>NOTE:</b> This {@link TagSet} DOES NOT distinguish between Proper- and
+ * Common- Nouns!<p>
*/
public static final TagSet<PosTag> DUTCH = new TagSet<PosTag>("WOTAN Dutch","nl");
static {
DUTCH.addTag(new PosTag("Adj",LexicalCategory.Adjective));
DUTCH.addTag(new PosTag("Adv",LexicalCategory.Adverb));
- DUTCH.addTag(new PosTag("Art",LexicalCategory.PronounOrDeterminer));
+ DUTCH.addTag(new PosTag("Art",Pos.Article));
DUTCH.addTag(new PosTag("Conj",LexicalCategory.Conjuction));
DUTCH.addTag(new PosTag("Int",LexicalCategory.Interjection));
DUTCH.addTag(new PosTag("N",LexicalCategory.Noun));
- DUTCH.addTag(new PosTag("Num",LexicalCategory.Quantifier));
- DUTCH.addTag(new PosTag("Misc",null));
- DUTCH.addTag(new PosTag("Prep",LexicalCategory.Adposition));
- DUTCH.addTag(new PosTag("Pron",LexicalCategory.PronounOrDeterminer));
+ DUTCH.addTag(new PosTag("Num",Pos.Numeral));
+ DUTCH.addTag(new PosTag("Misc"));
+ DUTCH.addTag(new PosTag("Prep",Pos.Preposition));
+ DUTCH.addTag(new PosTag("Pron",Pos.Pronoun));
DUTCH.addTag(new PosTag("Punc",LexicalCategory.Punctuation));
DUTCH.addTag(new PosTag("V",LexicalCategory.Verb));
getInstance().add(DUTCH);
@@ -189,51 +192,53 @@ public final class PosTagSetRegistry {
* POS tags used by the Swedish POS model of OpenNLP for Swedish based on the
* <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
* Lexical categories in MAMBA</a>
+ * Most of the <i>'interesting'</i> {@link Pos} mappings would be defined
+ * as "Features" of MAMBA.
*/
public static final TagSet<PosTag> SWEDISH = new TagSet<PosTag>("MAMBA Swedish","sv");
static {
- SWEDISH.addTag(new PosTag("PN",LexicalCategory.Noun));
- SWEDISH.addTag(new PosTag("MN",LexicalCategory.Noun));
- SWEDISH.addTag(new PosTag("AN",LexicalCategory.Noun));
- SWEDISH.addTag(new PosTag("VN",LexicalCategory.Noun));
- SWEDISH.addTag(new PosTag("NN",LexicalCategory.Noun));
- SWEDISH.addTag(new PosTag("PO",LexicalCategory.PronounOrDeterminer));
- SWEDISH.addTag(new PosTag("EN",LexicalCategory.Quantifier));
- SWEDISH.addTag(new PosTag("RO",LexicalCategory.Quantifier));
+ SWEDISH.addTag(new PosTag("PN",Pos.ProperNoun));
+ SWEDISH.addTag(new PosTag("MN",Pos.CommonNoun)); //TODO: missing Meta-Nouns
+ SWEDISH.addTag(new PosTag("AN",Pos.CommonNoun)); //TODO: missing Adjectival noun
+ SWEDISH.addTag(new PosTag("VN",Pos.VerbalNoun));
+ SWEDISH.addTag(new PosTag("NN",Pos.CommonNoun));
+ SWEDISH.addTag(new PosTag("PO",Pos.Pronoun));
+ SWEDISH.addTag(new PosTag("EN",Pos.IndefiniteArticle, Pos.Numeral));
+ SWEDISH.addTag(new PosTag("RO",Pos.Numeral));
SWEDISH.addTag(new PosTag("AJ",LexicalCategory.Adjective));
- SWEDISH.addTag(new PosTag("AV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("BV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("HV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("WV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("QV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("KV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("SV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("VV",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("TP",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("SP",LexicalCategory.Verb));
+ SWEDISH.addTag(new PosTag("AV",LexicalCategory.Verb)); //"vara" (be)
+ SWEDISH.addTag(new PosTag("BV",LexicalCategory.Verb)); //"bli(va)" (become)
+ SWEDISH.addTag(new PosTag("HV",LexicalCategory.Verb)); //"ha(va)" (have)
+ SWEDISH.addTag(new PosTag("WV",LexicalCategory.Verb)); //"vilja" (want)
+ SWEDISH.addTag(new PosTag("QV",LexicalCategory.Verb)); //"kunna" (can)
+ SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb)); //"måste" (must)
+ SWEDISH.addTag(new PosTag("KV",LexicalCategory.Verb)); // locution "komma att" (periphrastic future)
+ SWEDISH.addTag(new PosTag("SV",LexicalCategory.Verb)); //"skola" (will, shall)
+ SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb)); //"göra" (do, make)
+ SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb)); //"få" (get)
+ SWEDISH.addTag(new PosTag("VV",LexicalCategory.Verb)); //all other verbs
+ SWEDISH.addTag(new PosTag("TP",Pos.PastParticiple)); //PerfectParticiple
+ SWEDISH.addTag(new PosTag("SP",Pos.PresentParticiple));
SWEDISH.addTag(new PosTag("AB",LexicalCategory.Adverb));
- SWEDISH.addTag(new PosTag("PR",LexicalCategory.Adposition));
- SWEDISH.addTag(new PosTag("IM",LexicalCategory.Verb));
- SWEDISH.addTag(new PosTag("++",LexicalCategory.Conjuction));
- SWEDISH.addTag(new PosTag("UK",LexicalCategory.Conjuction));
- SWEDISH.addTag(new PosTag("IK",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("IP",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("I?",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("IU",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("IQ",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("IS",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("IT",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("IR",LexicalCategory.Punctuation));
+ SWEDISH.addTag(new PosTag("PR",Pos.Preposition));
+ SWEDISH.addTag(new PosTag("IM",Pos.Infinitive));
+ SWEDISH.addTag(new PosTag("++",Pos.CoordinatingConjunction));
+ SWEDISH.addTag(new PosTag("UK",Pos.SubordinatingConjunction));
+ SWEDISH.addTag(new PosTag("IK",Pos.Comma));
+ SWEDISH.addTag(new PosTag("IP",Pos.Point));
+ SWEDISH.addTag(new PosTag("I?",Pos.QuestionMark));
+ SWEDISH.addTag(new PosTag("IU",Pos.ExclamativePoint));
+ SWEDISH.addTag(new PosTag("IQ",Pos.Colon));
+ SWEDISH.addTag(new PosTag("IS",Pos.SemiColon));
+ SWEDISH.addTag(new PosTag("IT",Pos.Hyphen));
+ SWEDISH.addTag(new PosTag("IR",Pos.ParentheticalPunctuation));
SWEDISH.addTag(new PosTag("IC",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("PU",LexicalCategory.Punctuation));
+ SWEDISH.addTag(new PosTag("PU",Pos.ListMarker));
SWEDISH.addTag(new PosTag("IG",LexicalCategory.Punctuation));
- SWEDISH.addTag(new PosTag("YY",LexicalCategory.Conjuction));
+ SWEDISH.addTag(new PosTag("YY",Pos.Interjection));
SWEDISH.addTag(new PosTag("ID",LexicalCategory.Noun));
- SWEDISH.addTag(new PosTag("XX",null));
+ SWEDISH.addTag(new PosTag("XX"));
getInstance().add(SWEDISH);
}
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java?rev=1403228&r1=1403227&r2=1403228&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/services/POSTaggingEngine.java Mon Oct 29 10:16:14 2012
@@ -255,17 +255,18 @@ public class POSTaggingEngine extends Ab
posTag(tokenList, posTagger,tagSet,adhocTags);
}
-
- logAnnotations(at);
+ if(log.isTraceEnabled()){
+ logAnnotations(at);
+ }
}
private void logAnnotations(AnalysedText at){
Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
while(it.hasNext()){
Span span = it.next();
- log.info(" > {}",span);
+ log.trace(" > {}",span);
for(Value<PosTag> value : span.getAnnotations(POS_ANNOTATION)){
- log.info(" - {}",value);
+ log.trace(" - {}",value);
}
}
}