You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/08/23 11:22:54 UTC
svn commit: r1516775 - in
/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl:
ChunkData.java EntityLinker.java ProcessingState.java SectionData.java
TokenData.java
Author: rwesten
Date: Fri Aug 23 09:22:53 2013
New Revision: 1516775
URL: http://svn.apache.org/r1516775
Log:
STANBOL-1128: Refactorings to the internal API of the EntityLinkingEngine required for reuse in the FST Linking Engine
Added:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1516775&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Fri Aug 23 09:22:53 2013
@@ -0,0 +1,108 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+
+/**
+ * Represents a Chunk (group of tokens) used as context for EntityLinking.
+ * Typically a single {@link ChunkData#chunk} is used, but in case of
+ * overlapping and {@link ChunkData#isProcessable processable} chunks
+ * multiple {@link Chunk}s might be merged into a single {@link ChunkData}
+ * instance. In such cases {@link ChunkData#chunk} represents the
+ * first and {@link ChunkData#merged} the last of the merged chunks.<p>
+ * {@link ChunkData#startToken} and {@link ChunkData#endToken} represent
+ * the covered {@link Token} indices relative to the current section
+ * (typically a {@link Sentence}); they are not set by the constructor.
+ * {@link ChunkData#getStartChar()} and {@link ChunkData#getEndChar()} are the
+ * absolute [start,end) character indices within the {@link AnalysedText#getSpan()}.
+ */
+public class ChunkData {
+ protected final static boolean DEFAULT_PROCESSABLE_STATE = true; // used if no phrase annotation clearly accepts or rejects the chunk
+ /** if the Chunk is processable (decided once in the constructor) */
+ public final boolean isProcessable;
+ /** the Chunk parsed to the constructor (the first one in case of merged chunks) */
+ public final Chunk chunk;
+ /**
+ * In case of multiple overlapping and processable {@link Chunk}s the
+ * sections selected by those chunks are merged. While {@link #chunk}
+ * holds the original chunk (the first) this variable holds the
+ * last merged one (<code>null</code> if nothing was merged). Enclosed
+ * chunks (in case more than two are merged) are not available via this
+ * class, but can be retrieved by iterating over the {@link AnalysedText}.
+ */
+ Chunk merged;
+ /** the start token index relative to the current section (sentence) */
+ int startToken;
+ /** the end token index relative to the current section; NOTE(review): confirm if inclusive or exclusive */
+ int endToken;
+ /**
+ * The number of matchable Tokens enclosed by this Chunk (not maintained by this class itself)
+ */
+ int matchableCount;
+ /** Constructs the meta data for the parsed {@link Chunk} and decides if it is processable.
+ * @param tpc the configuration providing the processed phrase categories/tags and probability limits
+ * @param chunk the chunk; its phrase annotations are checked in order until one decides the state
+ */
+ public ChunkData(LanguageProcessingConfig tpc, Chunk chunk){
+ this.chunk = chunk;
+ Boolean process = null;
+ for (Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)) {
+ if (tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
+ || tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
+ if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY || // unknown probability is accepted
+ phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()) {
+ process = true;
+ break;
+ } // else probability too low for inclusion
+ } else if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()) {
+ process = false;
+ break;
+ } // else probability too low for exclusion
+ }
+ isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process;
+ }
+ /**
+ * Getter for the start character position
+ * @return the start character position of {@link #chunk}.
+ */
+ public int getStartChar(){
+ return chunk.getStart();
+ }
+ /**
+ * Getter for the end character position of the text selected by
+ * possibly multiple {@link #merged} chunks.
+ * @return the end of {@link #merged} if set, otherwise the end of
+ * {@link #chunk}.
+ */
+ public int getEndChar(){
+ return merged == null ? chunk.getEnd() : merged.getEnd();
+ }
+ /**
+ * If this chunk is processable
+ * @return the value of {@link #isProcessable}
+ */
+ public boolean isProcessable() {
+ return isProcessable;
+ }
+ /**
+ * Getter for the number of matchable tokens contained in this chunk
+ * @return the current value of {@link #matchableCount}
+ */
+ public int getMatchableCount() {
+ return matchableCount;
+ }
+ public int getStartTokenIndex() { // returns startToken (index relative to the current section)
+ return startToken;
+ }
+ public int getEndTokenIndex() { // returns endToken (index relative to the current section)
+ return endToken;
+ }
+}
\ No newline at end of file
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1516775&r1=1516774&r2=1516775&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Fri Aug 23 09:22:53 2013
@@ -43,7 +43,6 @@ import org.apache.stanbol.enhancer.engin
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
-import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState.TokenData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Section;
@@ -351,7 +350,7 @@ public class EntityLinker {
LinkedEntity linkedEntity = linkedEntities.get(selectedText);
if(linkedEntity == null){
linkedEntity = new LinkedEntity(selectedText,
- suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ suggestions, getLinkedEntityTypes(suggestions));
linkedEntities.put(selectedText, linkedEntity);
} // else Assumption: The list of suggestions is the SAME
linkedEntity.addOccurrence(state.getSentence(),
@@ -374,7 +373,7 @@ public class EntityLinker {
linkedEntity = linkedEntities.get(selectedText);
if(linkedEntity == null){
linkedEntity = new LinkedEntity(selectedText,
- partialMatches, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ partialMatches, getLinkedEntityTypes(suggestions));
linkedEntities.put(selectedText, linkedEntity);
} // else Assumption: The list of suggestions is the SAME
linkedEntity.addOccurrence(state.getSentence(),
@@ -486,7 +485,12 @@ public class EntityLinker {
*/
private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
Collection<UriRef> conceptTypes = new HashSet<UriRef>();
+ double score = -1; //only consider types of the best ranked Entities
for(Suggestion suggestion : suggestions){
+ double actScore = suggestion.getScore();
+ if(actScore < score){
+ break;
+ }
for(Iterator<UriRef> types =
suggestion.getEntity().getReferences(linkerConfig.getTypeField());
types.hasNext();conceptTypes.add(types.next()));
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1516775&r1=1516774&r2=1516775&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Fri Aug 23 09:22:53 2013
@@ -19,9 +19,7 @@
*/
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
-import static java.util.Collections.disjoint;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
-import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
@@ -35,9 +33,8 @@ import java.util.Locale;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
-import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
-import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
@@ -45,11 +42,7 @@ import org.apache.stanbol.enhancer.nlp.m
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.Token;
-import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
-import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
-import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
-import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -251,149 +244,11 @@ public class ProcessingState {
continue; //ignore this section
}
consumedSectionIndex = section.getEnd();
- tokens.clear(); //clear token for each section (STANBOL-818)
- Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
- ChunkData activeChunk = null;
- while(enclosed.hasNext()){
- Span span = enclosed.next();
- if(span.getStart() >= span.getEnd()){ //save guard against empty spans
- log.warn("Detected Empty Span {} in section {} of Blob {}",
- new Object[]{span,section, at.getBlob()});
- }
- if(span.getType() == SpanTypeEnum.Chunk){
- ChunkData chunkData = new ChunkData((Chunk)span);
- if(chunkData.isProcessable()){
- if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
- if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
- log.info(" - merge overlapping and processable Chunks {} <-> {}",
- activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
- activeChunk.merged = (Chunk)span; //set this one as last merged
- } //ignore completely covered chunks
- } else { // a new Chunk starts
- activeChunk = chunkData;
- activeChunk.startToken = tokens.size();
- if(log.isDebugEnabled()){
- log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
- new Object []{
- activeChunk.chunk.getType(),
- activeChunk.startToken,
- activeChunk.chunk.getSpan()
- });
- }
- }
- } //else ignore chunks that are not processable
- } else if(span.getType() == SpanTypeEnum.Token){
- TokenData tokenData = new TokenData(tokens.size(),(Token)span,activeChunk);
- if(log.isDebugEnabled()){
- log.debug(" > {}: {} {}(pos:{}) chunk: '{}'",
- new Object[]{tokenData.index,tokenData.token,
- tokenData.morpho != null ? ("(lemma: "+tokenData.morpho.getLemma()+") ") : "",
- tokenData.token.getAnnotations(POS_ANNOTATION),
- tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none"});
- }
- if(!tokenData.hasAlphaNumeric){
- tokenData.isLinkable = false;
- tokenData.isMatchable = false;
- } else {
- // (1) apply basic rules for linkable/processable tokens
- //determine if the token should be linked/matched
- tokenData.isLinkable = tokenData.isLinkablePos != null ? tokenData.isLinkablePos : false;
- //matchabel := linkable OR has matchablePos
- tokenData.isMatchable = tokenData.isLinkable ||
- (tokenData.isMatchablePos != null && tokenData.isMatchablePos);
-
- //(2) for non linkable tokens check for upper case rules
- if(!tokenData.isLinkable && tokenData.upperCase &&
- tokenData.index > 0 && //not a sentence or sub-sentence start
- !tokens.get(tokenData.index-1).isSubSentenceStart){
- //We have an upper case token!
- if(tpc.isLinkUpperCaseTokens()){
- if(tokenData.isMatchable) { //convert matchable to
- tokenData.isLinkable = true; //linkable
- tokenData.isMatchable = true;
- } else { // and other tokens to
- tokenData.isMatchable = true; //matchable
- }
- } else {
- //finally we need to convert other Tokens to matchable
- //if MatchUpperCaseTokens is active
- if(!tokenData.isMatchable && tpc.isMatchUpperCaseTokens()){
- tokenData.isMatchable = true;
- }
- }
- } //else not an upper case token
-
- //(3) Unknown POS tag Rules (see STANBOL-1049)
- if(!tokenData.isLinkable && (tokenData.isLinkablePos == null ||
- tokenData.isMatchablePos == null)){
- if(isUnicaseLanguage || !tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
- if(tokenData.isLinkablePos == null && tokenData.hasSearchableLength){
- tokenData.isLinkable = true;
- tokenData.isMatchable = true;
- } //else no need to change the state
- } else { //non unicase language and link only upper case tokens enabled
- if(tokenData.upperCase && // upper case token
- tokenData.index > 0 && //not a sentence or sub-sentence start
- !tokens.get(tokenData.index-1).isSubSentenceStart){
- if(tokenData.hasSearchableLength && tokenData.isLinkablePos == null){
- tokenData.isLinkable = true;
- tokenData.isMatchable = true;
- } else if(tokenData.isMatchablePos == null){
- tokenData.isMatchable = true;
- }
- } else if(tokenData.hasSearchableLength && //lower case and long token
- tokenData.isMatchablePos == null){
- tokenData.isMatchable = true;
- } //else lower case and short word
- }
- } //else already linkable or POS tag present
- }
- log.debug(" - {}",tokenData);
- //add the token to the list
- tokens.add(tokenData);
- if(!foundLinkableToken){
- foundLinkableToken = tokenData.isLinkable;
- }
- if(activeChunk != null){
- if (tokenData.isLinkable){
- //ignore matchableCount in Chunks with linkable Tokens
- activeChunk.matchableCount = -10; //by setting the count to -10
- } else if(tokenData.isMatchable){
- activeChunk.matchableCount++;
- }
- if (span.getEnd() >= activeChunk.getEndChar()){
- //this is the last token in the current chunk
- activeChunk.endToken = tokens.size()-1;
- log.debug(" - end Chunk@pos: {}", activeChunk.endToken);
- if(tpc.isLinkMultiMatchableTokensInChunk() &&
- activeChunk.getMatchableCount() > 1 ){
- log.debug(" - multi-matchable Chunk:");
- //mark the last of two immediate following matchable
- //tokens as processable
- for(int i = activeChunk.endToken-1;i >= activeChunk.startToken+1;i--){
- TokenData ct = tokens.get(i);
- TokenData pt = tokens.get(i-1);
- if(ct.isMatchable && pt.isMatchable){
- if(!ct.isLinkable) { //if not already processable
- log.debug(" > convert Token {}: {} (pos:{}) from matchable to processable",
- new Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
- ct.isLinkable = true;
- if(!foundLinkableToken){
- foundLinkableToken = true;
- }
- }
- i--;//mark both (ct & pt) as processed
- }
- }
- }
- activeChunk = null;
- }
- }
- }
- }
- if(activeChunk != null) { //close the last chunk (if not done)
- activeChunk.endToken = tokens.size()-1;
- }
+ SectionData sectionData = new SectionData(tpc, section, enclosedSpanTypes, foundLinkableToken);
+ //TODO: It would be better to use a SectionData field instead
+ tokens = sectionData.getTokens();
+ section = sectionData.section;
+ foundLinkableToken = sectionData.hasLinkableToken();
}
processableTokensIterator = new FilterIterator(tokens.iterator(), PROCESSABLE_TOKEN_OREDICATE);
return foundLinkableToken;
@@ -421,71 +276,6 @@ public class ProcessingState {
tokens.get(start+(tokenCount-1)).token.getEnd()-offset);
}
-// /**
-
-// */
-// protected boolean getProcessablePosTag(Token token) {
-// for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
-// // check three possible match
-// // 1. the LexicalCategory matches
-// // 2. the Pos matches
-// // 3. the String tag matches
-// PosTag posTag = posAnnotation.value();
-//// log.debug(" ... check PosAnntation {} (lc:{}|pos:{}|tag:{}",
-//// new Object[]{posAnnotation,posTag.getCategories(),
-//// posTag.getPosHierarch(),posTag.getTag()});
-// if((!Collections.disjoint(tpc.getProcessedLexicalCategories(),
-// posTag.getCategories())) ||
-// (!Collections.disjoint(tpc.getProcessedPos(),
-// posTag.getPosHierarchy())) ||
-// tpc.getProcessedPosTags().contains(
-// posTag.getTag())){
-// if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-// return true;
-// } // else probability to low for inclusion
-// } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-// return false;
-// } // else probability to low for exclusion
-// }
-// return token.getSpan().length() >= elc.getMinSearchTokenLength();
-// }
-
-// Both
-// protected boolean isMatchableToken(Token token){
-// for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
-// PosTag posTag = posAnnotation.value();
-// if(posTag.isMapped()){
-// if(!Collections.disjoint(tpc.getMatchableLexicalCategories(),
-// posTag.getCategories())){
-// if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-// return true;
-// } // else probability to low for inclusion
-// } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-// return false;
-// } // else probability to low for exclusion
-// } //else not matched ... search next one
-// }
-// return token.getSpan().length() >= elc.getMinSearchTokenLength();
-// }
-//
-//
-// protected boolean isProcesableChunk(Chunk chunk){
-// for(Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)){
-// if(tpc.getProcessedPhraseCategories().contains(
-// phraseAnnotation.value().getCategory()) ||
-// tpc.getProcessedPhraseTags().contains(
-// phraseAnnotation.value().getTag())){
-// if(phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()){
-// return true;
-// } // else probability to low for inclusion
-// } else if(phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()){
-// return false;
-// } // else probability to low for exclusion
-// }
-// //neither a clear accept/reject ...
-// return true;
-// }
-
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@@ -510,329 +300,4 @@ public class ProcessingState {
return sb.toString();
}
- /**
- * Internally used to store additional Metadata for Tokens of the current Sentence
- * <p>
- * Checks if the parsed {@link Token} is processable. This decision is taken first based on the POS
- * annotation ( Lexical Category, POS tag) and second on the
- * {@link EntityLinkerConfig#getMinSearchTokenLength()} if no POS annotations are available or the
- * probability of the POS annotations is to low.
- * <p>
- * Since STANBOL-685two POS Probabilities are used <ul>
- * <li> {@link LanguageProcessingConfig#getMinPosAnnotationProbability()} for accepting POS tags that are
- * processed - included in {@link LanguageProcessingConfig#getLinkedLexicalCategories()} or
- * {@link LanguageProcessingConfig#getLinkedPosTags()}.
- * <li> {@link LanguageProcessingConfig#getMinExcludePosAnnotationProbability()} for those that are not
- * processed. By default the exclusion probability is set to half of the inclusion one.
- * </ul>
- * Assuming that the <code>minPosTypePropb=0.667</code> a
- * <ul>
- * <li>noun with the prop 0.8 would result in returning <code>true</code>
- * <li>noun with prop 0.5 would return <code>null</code>
- * <li>verb with prop 0.4 would return <code>false</code>
- * <li>verb with prop 0.3 would return <code>null</code>
- * </ul>
- * This algorithm makes it less likely that the {@link EntityLinkerConfig#getMinSearchTokenLength()} needs
- * to be used as fallback for Tokens (what typically still provides better estimations as the token
- * length).
- * <p>
- * (see also STANBOL-685 even that this Issue refers a version of this Engine that has not yet used the
- * Stanbol NLP processing chain)
- *
- * @param token
- * the {@link Token} to check.
- * @return <code>true</code> if the parsed token needs to be processed. Otherwise <code>false</code>
- */
- public class TokenData {
- /** The Token */
- public final Token token;
- /** The index of the Token within the current Section (Sentence) */
- public final int index;
- /** If this Token should be linked with the Vocabulary */
- public boolean isLinkable;
- /** If this Token should be used for multi word searches in the Vocabulary */
- public boolean isMatchable;
- /** if this Token has an alpha or numeric char */
- public final boolean hasAlphaNumeric;
- /** the chunk of this Token */
- public final ChunkData inChunk;
- /** the morphological features of the Token (selected based on the POS Tag) */
- public final MorphoFeatures morpho;
- /**
- * if this token starts with an upperCase letter
- */
- public final boolean upperCase;
- /**
- * if the length of the token is >= {@link LanguageProcessingConfig#getMinSearchTokenLength()}
- */
- public boolean hasSearchableLength;
- /**
- * If the POS type of this word matches a linkable category
- */
- public final Boolean isLinkablePos;
- /**
- * if the POS type of this word matches a matchable category
- */
- public final Boolean isMatchablePos;
- /**
- * if this Token represents the start of an sub-sentence such as an
- * starting ending quote
- * @see ProcessingState#SUB_SENTENCE_START_POS
- */
- public final boolean isSubSentenceStart;
- /**
- * Constructs and initializes meta data needed for linking based
- * on the current tokens (and its NLP annotation)
- * @param index the index of the Token within the current section
- * @param token the token
- * @param chunk the current chunk or <code>null</code> if none
- */
- TokenData(int index,Token token, ChunkData chunk) {
- //(0) init fields
- this.token = token;
- this.index = index;
- this.inChunk = chunk;
- this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
- this.hasSearchableLength = token.getSpan().length() >= tpc.getMinSearchTokenLength();
- PosTag selectedPosTag = null;
- boolean matchedPosTag = false; //matched any of the POS annotations
-
- //(1) check if this Token should be linked against the Vocabulary (isProcessable)
- upperCase = token.getEnd() > token.getStart() && //not an empty token
- Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
- boolean isLinkablePos = false;
- boolean isMatchablePos = false;
- boolean isSubSentenceStart = false;
- List<Value<PosTag>> posAnnotations = token.getAnnotations(POS_ANNOTATION);
- for(Value<PosTag> posAnnotation : posAnnotations){
- // check three possible match
- // 1. the LexicalCategory matches
- // 2. the Pos matches
- // 3. the String tag matches
- PosTag posTag = posAnnotation.value();
- if((!disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories())) ||
- (!disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy())) ||
- tpc.getLinkedPosTags().contains(posTag.getTag())){
- if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
- selectedPosTag = posTag;
- isLinkablePos = true;
- isMatchablePos = true;
- matchedPosTag = true;
- break;
- } // else probability to low for inclusion
- } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
- selectedPosTag = posTag; //also rejected PosTags are selected
- matchedPosTag = true;
- isLinkablePos = false;
- break;
- } // else probability to low for exclusion
- }
- if(!matchedPosTag) { //not matched against a POS Tag ...
- this.isLinkablePos = null;
- } else {
- this.isLinkablePos = isLinkablePos;
- }
-
- //(2) check if this token should be considered to match labels of suggestions
- if(this.isLinkablePos != null && this.isLinkablePos){ //processable tokens are also matchable
- this.isMatchablePos = true;
- } else { //check POS and length to see if token is matchable
- matchedPosTag = false; //reset to false!
- for(Value<PosTag> posAnnotation : posAnnotations){
- PosTag posTag = posAnnotation.value();
- if(posTag.isMapped()){
- if(!Collections.disjoint(tpc.getMatchedLexicalCategories(),
- posTag.getCategories())){
- if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
- //override selectedPosTag if present
- selectedPosTag = posTag; //mark the matchable as selected PosTag
- isMatchablePos = true;
- matchedPosTag = true;
- break;
- } // else probability to low for inclusion
- } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
- if(selectedPosTag == null){ //do not override existing values
- selectedPosTag = posTag; //also rejected PosTags are selected
- }
- isMatchablePos = false;
- matchedPosTag = true;
- break;
- } // else probability to low for exclusion
- } //else not matched ... search next one
- }
- if(!matchedPosTag){ //not matched against POS tag ...
- //fall back to the token length
- this.isMatchablePos = null;
- //this.isMatchablePos = token.getSpan().length() >= tpc.getMinSearchTokenLength();
- } else {
- this.isMatchablePos = isMatchablePos;
- }
- }
- //(3) check if the POS tag indicates the start/end of an sub-sentence
- for(Value<PosTag> posAnnotation : posAnnotations){
- PosTag posTag = posAnnotation.value();
- if((!disjoint(SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
- if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
- isSubSentenceStart = true;
- } // else probability to low for inclusion
- } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
- isSubSentenceStart = false;
- }
- }
- this.isSubSentenceStart = isSubSentenceStart;
-
- //(4) check for morpho analyses
- if(selectedPosTag == null){ //token is not processable or matchable
- //we need to set the selectedPoas tag to the first POS annotation
- Value<PosTag> posAnnotation = token.getAnnotation(POS_ANNOTATION);
- if(posAnnotation != null) {
- selectedPosTag = posAnnotation.value();
- }
- }
- List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
- if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
- morpho = morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
- } else { //select the correct morpho annotation based on the POS tag
- MorphoFeatures mf = null;
- selectMorphoFeature :
- for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
- for(PosTag posTag : morphoAnnotation.value().getPosList()){
- if(!disjoint(selectedPosTag.getCategories(),posTag.getCategories())){
- mf = morphoAnnotation.value();
- break selectMorphoFeature; //stop after finding the first one
- }
- }
- }
- morpho = mf;
- }
-
- }
-
- /**
- * Getter for token text
- * @return the text of the token
- */
- public String getTokenText(){
- return token.getSpan();
- }
- /**
- * Getter for the Lemma of the token.
- * @return the Lemma of the Token or <code>null</code> if not available
- */
- public String getTokenLemma(){
- return morpho != null ? morpho.getLemma() : null;
- }
- @Override
- public String toString() {
- return new StringBuilder("TokenData: '").append(getTokenText())
- .append("'[linkable=").append(isLinkable).append("(linkabkePos=").append(isLinkablePos)
- .append(")| matchable=").append(isMatchable).append("(matchablePos=").append(isMatchablePos)
- .append(")| alpha=").append(hasAlphaNumeric).append("| seachLength=")
- .append(hasSearchableLength).append("| upperCase=").append(upperCase)
- .append("]").toString();
- }
- }
- /**
- * Represents a Chunk (group of tokens) used as context for EntityLinking.
- * Typically a single {@link ChunkData#chunk} is used, but in case of
- * overlapping and {@link ChunkData#isProcessable processable} chunks
- * multiple {@link Chunk}s might be merged to a single {@link ChunkData}
- * instance. In such cases {@link ChunkData#chunk} represents the
- * first and {@link ChunkData#merged} the last of the merged chunks.<p>
- * {@link ChunkData#startToken} and {@link ChunkData#endToken} represent
- * the covered [start,end) {@link Token} indices relative to the current
- * sections (typically a {@link Sentence}). {@link ChunkData#getStartChar()}
- * and {@link ChunkData#getEndChar()} are the absolute [start,end) character
- * indices within the {@link AnalysedText#getSpan()}
- */
- public class ChunkData {
- protected final static boolean DEFAULT_PROCESSABLE_STATE = true;
- /** if the Chunk is processable */
- public final boolean isProcessable;
- /** the Chunk */
- public final Chunk chunk;
- /**
- * In case multiple overlapping and processable {@link Chunk}s the
- * section selected by the chunks are merged. While {@link #chunk}
- * holds the original chunk (the first) this variable holds the
- * last merged one. Enclosed chunks (in case more than two are
- * merged) are not available via this class, but can be retrieved
- * by iterating over the {@link AnalysedText} content part.
- */
- private Chunk merged;
- /** the start token index relative to the current section (sentence) */
- private int startToken;
- /** the end token index relative to the current section (sentence) */
- private int endToken;
- /**
- * The number of matchable Tokens enclosed by this Chunk
- */
- int matchableCount;
- /**
- * constructs and initializes the meta data for the parsed {@link Chunk}
- * @param chunk
- */
- ChunkData(Chunk chunk){
- this.chunk = chunk;
- Boolean process = null;
- for (Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)) {
- if (tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
- || tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
- if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()) {
- process = true;
- break;
- } // else probability to low for inclusion
- } else if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
- phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()) {
- process = false;
- break;
- } // else probability to low for exclusion
- }
- isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process;
- }
- /**
- * Getter for the start character position
- * @return the start character position of the selected text span.
- */
- public int getStartChar(){
- return chunk.getStart();
- }
- /**
- * Getter for the end character position of the text selected by
- * possible multiple {@link #merged} chunks.
- * @return the end character position considering possible {@link #merged}
- * chunks.
- */
- public int getEndChar(){
- return merged == null ? chunk.getEnd() : merged.getEnd();
- }
- /**
- * If this chunk is processable
- * @return the state
- */
- public boolean isProcessable() {
- return isProcessable;
- }
- /**
- * Getter for the number of matchable tokens contained in this chunk
- * @return The number of matchable tokens contained in this chunk
- */
- public int getMatchableCount() {
- return matchableCount;
- }
- public int getStartTokenIndex() {
- return startToken;
- }
- public int getEndTokenIndex() {
- return endToken;
- }
- }
-
}
\ No newline at end of file
Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1516775&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Fri Aug 23 09:22:53 2013
@@ -0,0 +1,188 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/** Holds the {@link TokenData} of all tokens enclosed by one {@link Section}; everything is computed by the constructor. */
+public class SectionData {
+
+ private static final Logger log = LoggerFactory.getLogger(SectionData.class);
+
+ /**
+ * The section this data was built for
+ */
+ public final Section section;
+ /**
+ * Holds the {@link TokenData} for the {@link Token}s of the {@link #section}
+ * to allow fast index based access.
+ */
+ private List<TokenData> tokens = new ArrayList<TokenData>(64);
+ /**
+ * If a linkable token is present in this section
+ */
+ private boolean hasLinkableToken = false;
+ /** Walks the enclosed spans once, tracks the active (possibly merged) chunk and marks every token as linkable and/or matchable. */
+ public SectionData(LanguageProcessingConfig tpc, Section section,
+ Set<SpanTypeEnum> enclosedSpanTypes, boolean isUnicaseLanguage){
+ this.section = section;
+ Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
+ ChunkData activeChunk = null; //the chunk that is currently open (null if none)
+ while(enclosed.hasNext()){
+ Span span = enclosed.next();
+ if(span.getStart() >= span.getEnd()){ //empty span: only logged - NOTE(review): the span is still processed below, confirm intended
+ log.warn("Detected Empty Span {} in section {}: '{}'",
+ new Object[]{span,section, section.getSpan()});
+ }
+ if(span.getType() == SpanTypeEnum.Chunk){
+ ChunkData chunkData = new ChunkData(tpc,(Chunk)span);
+ if(chunkData.isProcessable()){
+ if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
+ if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
+ log.info(" - merge overlapping and processable Chunks {} <-> {}",
+ activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
+ activeChunk.merged = (Chunk)span; //set this one as last merged
+ } //ignore completely covered chunks
+ } else { // a new Chunk starts
+ activeChunk = chunkData;
+ activeChunk.startToken = tokens.size(); //index the next added token will get
+ if(log.isDebugEnabled()){
+ log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
+ new Object []{
+ activeChunk.chunk.getType(),
+ activeChunk.startToken,
+ activeChunk.chunk.getSpan()
+ });
+ }
+ }
+ } //else ignore chunks that are not processable
+ } else if(span.getType() == SpanTypeEnum.Token){
+ TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,activeChunk);
+ if(log.isDebugEnabled()){
+ log.debug(" > {}: {} {}(pos:{}) chunk: '{}'",
+ new Object[]{tokenData.index,tokenData.token,
+ tokenData.morpho != null ? ("(lemma: "+tokenData.morpho.getLemma()+") ") : "",
+ tokenData.token.getAnnotations(POS_ANNOTATION),
+ tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none"});
+ }
+ if(!tokenData.hasAlphaNumeric){ //tokens without alpha-numeric chars are never linked nor matched
+ tokenData.isLinkable = false;
+ tokenData.isMatchable = false;
+ } else {
+ // (1) apply basic rules for linkable/processable tokens
+ //determine if the token should be linked/matched (an undecided POS state - null - counts as false)
+ tokenData.isLinkable = tokenData.isLinkablePos != null ? tokenData.isLinkablePos : false;
+ //matchable := linkable OR has matchablePos
+ tokenData.isMatchable = tokenData.isLinkable ||
+ (tokenData.isMatchablePos != null && tokenData.isMatchablePos);
+
+ //(2) for non linkable tokens check for upper case rules
+ if(!tokenData.isLinkable && tokenData.upperCase &&
+ tokenData.index > 0 && //not the first token of the section and
+ !tokens.get(tokenData.index-1).isSubSentenceStart){ //not directly after a sub-sentence start
+ //We have an upper case token!
+ if(tpc.isLinkUpperCaseTokens()){
+ if(tokenData.isMatchable) { //convert matchable to
+ tokenData.isLinkable = true; //linkable
+ tokenData.isMatchable = true;
+ } else { // and other tokens to
+ tokenData.isMatchable = true; //matchable
+ }
+ } else {
+ //finally we need to convert other Tokens to matchable
+ //if MatchUpperCaseTokens is active
+ if(!tokenData.isMatchable && tpc.isMatchUpperCaseTokens()){
+ tokenData.isMatchable = true;
+ }
+ }
+ } //else not an upper case token
+
+ //(3) Unknown POS tag Rules (see STANBOL-1049)
+ if(!tokenData.isLinkable && (tokenData.isLinkablePos == null ||
+ tokenData.isMatchablePos == null)){
+ if(isUnicaseLanguage || !tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
+ if(tokenData.isLinkablePos == null && tokenData.hasSearchableLength){
+ tokenData.isLinkable = true;
+ tokenData.isMatchable = true;
+ } //else no need to change the state
+ } else { //non unicase language and link only upper case tokens enabled
+ if(tokenData.upperCase && // upper case token
+ tokenData.index > 0 && //not the first token of the section and
+ !tokens.get(tokenData.index-1).isSubSentenceStart){ //not directly after a sub-sentence start
+ if(tokenData.hasSearchableLength && tokenData.isLinkablePos == null){
+ tokenData.isLinkable = true;
+ tokenData.isMatchable = true;
+ } else if(tokenData.isMatchablePos == null){
+ tokenData.isMatchable = true;
+ }
+ } else if(tokenData.hasSearchableLength && //lower case and long token
+ tokenData.isMatchablePos == null){
+ tokenData.isMatchable = true;
+ } //else lower case and short word
+ }
+ } //else already linkable or POS tag present
+ }
+ log.debug(" - {}",tokenData);
+ //add the token to the list
+ tokens.add(tokenData);
+ if(!hasLinkableToken){
+ hasLinkableToken = tokenData.isLinkable;
+ }
+ if(activeChunk != null){
+ if (tokenData.isLinkable){
+ //ignore matchableCount in Chunks with linkable Tokens
+ activeChunk.matchableCount = -10; //by setting the count to -10
+ } else if(tokenData.isMatchable){
+ activeChunk.matchableCount++;
+ }
+ if (span.getEnd() >= activeChunk.getEndChar()){
+ //this is the last token in the current chunk
+ activeChunk.endToken = tokens.size()-1;
+ log.debug(" - end Chunk@pos: {}", activeChunk.endToken);
+ if(tpc.isLinkMultiMatchableTokensInChunk() &&
+ activeChunk.getMatchableCount() > 1 ){
+ log.debug(" - multi-matchable Chunk:");
+ //mark the last of two immediate following matchable
+ //tokens as processable
+ for(int i = activeChunk.endToken-1;i >= activeChunk.startToken+1;i--){ //NOTE(review): starts at endToken-1, so the chunk's last token is never examined as ct - confirm intended
+ TokenData ct = tokens.get(i);
+ TokenData pt = tokens.get(i-1);
+ if(ct.isMatchable && pt.isMatchable){
+ if(!ct.isLinkable) { //if not already processable
+ log.debug(" > convert Token {}: {} (pos:{}) from matchable to processable",
+ new Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
+ ct.isLinkable = true;
+ if(!hasLinkableToken){
+ hasLinkableToken = true;
+ }
+ }
+ i--;//mark both (ct & pt) as processed
+ }
+ }
+ }
+ activeChunk = null; //chunk is closed; following tokens are outside of it
+ }
+ }
+ }
+ }
+ }
+ /** @return the internal (not copied) list with the {@link TokenData} of this section, in token order */
+ public List<TokenData> getTokens() {
+ return tokens;
+ }
+ /** @return <code>true</code> if at least one token of this section was marked as linkable */
+ public boolean hasLinkableToken() {
+ return hasLinkableToken;
+ }
+}
\ No newline at end of file
Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java?rev=1516775&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java Fri Aug 23 09:22:53 2013
@@ -0,0 +1,244 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static java.util.Collections.disjoint;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+/**
+ * Internally used to store additional Metadata for Tokens of the current Sentence
+ * <p>
+ * Checks if the parsed {@link Token} is processable. This decision is taken first based on the POS
+ * annotation ( Lexical Category, POS tag) and second on the
+ * {@link LanguageProcessingConfig#getMinSearchTokenLength()} if no POS annotations are available or the
+ * probability of the POS annotations is too low.
+ * <p>
+ * Since STANBOL-685 two POS Probabilities are used <ul>
+ * <li> {@link LanguageProcessingConfig#getMinPosAnnotationProbability()} for accepting POS tags that are
+ * processed - included in {@link LanguageProcessingConfig#getLinkedLexicalCategories()} or
+ * {@link LanguageProcessingConfig#getLinkedPosTags()}.
+ * <li> {@link LanguageProcessingConfig#getMinExcludePosAnnotationProbability()} for those that are not
+ * processed. By default the exclusion probability is set to half of the inclusion one.
+ * </ul>
+ * Assuming a minimum POS probability of <code>0.667</code>, linkable nouns and not linkable verbs, a
+ * <ul>
+ * <li>noun with the probability 0.8 would result in {@link #isLinkablePos} being <code>true</code>
+ * <li>noun with the probability 0.5 would result in <code>null</code>
+ * <li>verb with the probability 0.4 would result in <code>false</code>
+ * <li>verb with the probability 0.3 would result in <code>null</code>
+ * </ul>
+ * This algorithm makes it less likely that the {@link LanguageProcessingConfig#getMinSearchTokenLength()}
+ * needs to be used as fallback for Tokens (POS tags typically still provide better estimations than the
+ * token length).
+ * <p>
+ * (see also STANBOL-685 even that this Issue refers a version of this Engine that has not yet used the
+ * Stanbol NLP processing chain)
+ * <p>
+ * The results of the POS checks are stored in {@link #isLinkablePos} and {@link #isMatchablePos}:
+ * <code>true</code>/<code>false</code> if a POS annotation decided and <code>null</code> if none was
+ * probable enough. {@link #isLinkable} and {@link #isMatchable} are not set by this class.
+ */
+public class TokenData {
+ /** The Token */
+ public final Token token;
+ /** The index of the Token within the current Section (Sentence) */
+ public final int index;
+ /** If this Token should be linked with the Vocabulary */
+ public boolean isLinkable;
+ /** If this Token should be used for multi word searches in the Vocabulary */
+ public boolean isMatchable;
+ /** if this Token has an alpha or numeric char */
+ public final boolean hasAlphaNumeric;
+ /** the chunk of this Token (<code>null</code> if the token is not within a chunk) */
+ public final ChunkData inChunk;
+ /** the morphological features of the Token (selected based on the POS Tag); may be <code>null</code> */
+ public final MorphoFeatures morpho;
+ /**
+ * if this token starts with an upperCase letter
+ */
+ public final boolean upperCase;
+ /**
+ * if the length of the token is >= {@link LanguageProcessingConfig#getMinSearchTokenLength()}
+ */
+ public boolean hasSearchableLength;
+ /**
+ * If the POS type of this word matches a linkable category (<code>null</code> if undecided)
+ */
+ public final Boolean isLinkablePos;
+ /**
+ * if the POS type of this word matches a matchable category (<code>null</code> if undecided)
+ */
+ public final Boolean isMatchablePos;
+ /**
+ * if this Token represents the start of a sub-sentence such as a
+ * starting/ending quote
+ * @see ProcessingState#SUB_SENTENCE_START_POS
+ */
+ public final boolean isSubSentenceStart;
+ /**
+ * Constructs and initializes meta data needed for linking based on the token and its NLP annotations
+ * @param tpc the config providing the POS sets, the POS probabilities and the min search token length
+ * @param index the index of the Token within the current section
+ * @param token the token
+ * @param chunk the current chunk or <code>null</code> if none
+ */
+ public TokenData(LanguageProcessingConfig tpc, int index,Token token, ChunkData chunk) {
+ //(0) init fields
+ this.token = token;
+ this.index = index;
+ this.inChunk = chunk;
+ this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
+ this.hasSearchableLength = token.getSpan().length() >= tpc.getMinSearchTokenLength();
+ PosTag selectedPosTag = null;
+ boolean matchedPosTag = false; //matched any of the POS annotations
+
+ //(1) check if this Token should be linked against the Vocabulary (isProcessable)
+ upperCase = token.getEnd() > token.getStart() && //not an empty token
+ Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
+ boolean isLinkablePos = false;
+ boolean isMatchablePos = false;
+ boolean isSubSentenceStart = false;
+ List<Value<PosTag>> posAnnotations = token.getAnnotations(POS_ANNOTATION);
+ for(Value<PosTag> posAnnotation : posAnnotations){
+ // check the three possible matches
+ // 1. the LexicalCategory matches
+ // 2. the Pos matches
+ // 3. the String tag matches
+ PosTag posTag = posAnnotation.value();
+ if((!disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories())) ||
+ (!disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy())) ||
+ tpc.getLinkedPosTags().contains(posTag.getTag())){
+ if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+ selectedPosTag = posTag;
+ isLinkablePos = true;
+ isMatchablePos = true;
+ matchedPosTag = true;
+ break;
+ } // else probability too low for inclusion
+ } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+ selectedPosTag = posTag; //also rejected PosTags are selected
+ matchedPosTag = true;
+ isLinkablePos = false;
+ break;
+ } // else probability too low for exclusion
+ }
+ if(!matchedPosTag) { //not matched against a POS Tag ... state is undecided
+ this.isLinkablePos = null;
+ } else {
+ this.isLinkablePos = isLinkablePos;
+ }
+
+ //(2) check if this token should be considered to match labels of suggestions
+ if(this.isLinkablePos != null && this.isLinkablePos){ //processable tokens are also matchable
+ this.isMatchablePos = true;
+ } else { //check the POS annotations to see if token is matchable
+ matchedPosTag = false; //reset to false!
+ for(Value<PosTag> posAnnotation : posAnnotations){
+ PosTag posTag = posAnnotation.value();
+ if(posTag.isMapped()){
+ if(!Collections.disjoint(tpc.getMatchedLexicalCategories(),
+ posTag.getCategories())){
+ if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+ //override selectedPosTag if present
+ selectedPosTag = posTag; //mark the matchable as selected PosTag
+ isMatchablePos = true;
+ matchedPosTag = true;
+ break;
+ } // else probability too low for inclusion
+ } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+ if(selectedPosTag == null){ //do not override existing values
+ selectedPosTag = posTag; //also rejected PosTags are selected
+ }
+ isMatchablePos = false;
+ matchedPosTag = true;
+ break;
+ } // else probability too low for exclusion
+ } //else unmapped POS tag ... check the next one
+ }
+ if(!matchedPosTag){ //not matched against POS tag ...
+ //state is undecided (null); the former token length fallback is disabled (see next line)
+ this.isMatchablePos = null;
+ //this.isMatchablePos = token.getSpan().length() >= tpc.getMinSearchTokenLength();
+ } else {
+ this.isMatchablePos = isMatchablePos;
+ }
+ }
+ //(3) check if the POS tag indicates the start/end of an sub-sentence
+ for(Value<PosTag> posAnnotation : posAnnotations){ //no break: the last annotation that is probable enough decides
+ PosTag posTag = posAnnotation.value();
+ if((!disjoint(ProcessingState.SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
+ if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+ isSubSentenceStart = true;
+ } // else probability too low for inclusion
+ } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+ posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+ isSubSentenceStart = false;
+ }
+ }
+ this.isSubSentenceStart = isSubSentenceStart;
+
+ //(4) check for morpho analyses
+ if(selectedPosTag == null){ //token is not processable or matchable
+ //we need to set the selectedPosTag to the POS annotation returned by getAnnotation(..)
+ Value<PosTag> posAnnotation = token.getAnnotation(POS_ANNOTATION);
+ if(posAnnotation != null) {
+ selectedPosTag = posAnnotation.value();
+ }
+ }
+ List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
+ if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
+ morpho = morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
+ } else { //select the correct morpho annotation based on the POS tag
+ MorphoFeatures mf = null; //stays null if no morpho annotation shares a category with the selected POS tag
+ selectMorphoFeature :
+ for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
+ for(PosTag posTag : morphoAnnotation.value().getPosList()){
+ if(!disjoint(selectedPosTag.getCategories(),posTag.getCategories())){
+ mf = morphoAnnotation.value();
+ break selectMorphoFeature; //stop after finding the first one
+ }
+ }
+ }
+ morpho = mf;
+ }
+
+ }
+
+ /**
+ * Getter for token text
+ * @return the text of the token
+ */
+ public String getTokenText(){
+ return token.getSpan();
+ }
+ /**
+ * Getter for the Lemma of the token.
+ * @return the Lemma of the Token or <code>null</code> if not available
+ */
+ public String getTokenLemma(){
+ return morpho != null ? morpho.getLemma() : null;
+ }
+ @Override
+ public String toString() {
+ return new StringBuilder("TokenData: '").append(getTokenText())
+ .append("'[linkable=").append(isLinkable).append("(linkabkePos=").append(isLinkablePos)
+ .append(")| matchable=").append(isMatchable).append("(matchablePos=").append(isMatchablePos)
+ .append(")| alpha=").append(hasAlphaNumeric).append("| seachLength=")
+ .append(hasSearchableLength).append("| upperCase=").append(upperCase)
+ .append("]").toString();
+ }
+}
\ No newline at end of file