You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/02/24 06:12:03 UTC
svn commit: r1571145 - in /stanbol/branches/release-0.12/enhancement-engines:
entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/
entityhublinking/src/main/resources/OSGI-INF/metatype/
entitylinking/engine/src/main/java...
Author: rwesten
Date: Mon Feb 24 05:12:03 2014
New Revision: 1571145
URL: http://svn.apache.org/r1571145
Log:
Implementation of STANBOL-1285, STANBOL-1284, STANBOL-1283 and STANBOL-1282 for the 0.12-release branch.
Modified:
stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
Modified: stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java Mon Feb 24 05:12:03 2014
@@ -26,7 +26,9 @@ import static org.apache.stanbol.enhance
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES_FIELDS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.MIN_SEARCH_TOKEN_LENGTH;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_INCLUDE_SIMILAR_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.INCLUDE_SIMILAR_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_TOKEN_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.NAME_FIELD;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_FIELD;
@@ -108,6 +110,7 @@ import org.slf4j.LoggerFactory;
@Property(name=MIN_SEARCH_TOKEN_LENGTH, intValue=DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
@Property(name=MIN_TOKEN_SCORE,floatValue=DEFAULT_MIN_TOKEN_SCORE),
@Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
+ @Property(name=INCLUDE_SIMILAR_SCORE, boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
@Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
@Property(name=PROCESSED_LANGUAGES,
cardinality=Integer.MAX_VALUE,
Modified: stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Feb 24 05:12:03 2014
@@ -62,6 +62,11 @@ enhancer.engines.linking.suggestions.nam
enhancer.engines.linking.suggestions.description=The maximal \
number of suggestions returned for a single mention.
+enhancer.engines.linking.includeSimilarScore.name=Include Similar Score Suggestions
+enhancer.engines.linking.includeSimilarScore.description= If enabled all suggestions \
+with a similar score as the last one will be included in the result. Enabling this \
+will result in more entities being suggested than configured by 'Max Suggestions'
+
enhancer.engines.linking.minFoundTokens.name=Number of Required Tokens
enhancer.engines.linking.minFoundTokens.description=For lookups with \
several words (e.g. Dr Patrick Marshall) this is the minimum number of Tokens the label of an \
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java Mon Feb 24 05:12:03 2014
@@ -92,6 +92,11 @@ public class EntityLinkerConfig {
*/
public static final String SUGGESTIONS = "enhancer.engines.linking.suggestions";
/**
+ * If enabled Suggestions with similar scores are included. This means also that
+ * there might be more than {@link #SUGGESTIONS} results returned by the engine.
+ */
+ public static final String INCLUDE_SIMILAR_SCORE = "enhancer.engines.linking.includeSimilarScore";
+ /**
* If enabled {@link MorphoFeatures#getLemma()} values are used instead of the {@link Token#getSpan()} to
* search/match Entities within the Vocabulary linked against.
* @see EntityLinkerConfig#isLemmaMatching()
@@ -218,6 +223,10 @@ public class EntityLinkerConfig {
*/
public static final int DEFAULT_SUGGESTIONS = 3;
/**
+ * By default {@link #INCLUDE_SIMILAR_SCORE} is deactivated
+ */
+ public static final boolean DEFAULT_INCLUDE_SIMILAR_SCORE = false;
+ /**
* Default value for the number of tokens that must be contained in
* suggested terms. The default is <code>1</code>
*/
@@ -360,6 +369,8 @@ public class EntityLinkerConfig {
* The the maximum number of terms suggested for a word
*/
private int maxSuggestions = DEFAULT_SUGGESTIONS;
+
+ private boolean includeSuggestionsWithSimilarScore = DEFAULT_INCLUDE_SIMILAR_SCORE;
/**
* The minimum number of Tokens in the text that must match with
* a label of the Entity so that also non-exact matches are
@@ -585,6 +596,13 @@ public class EntityLinkerConfig {
}
linkerConfig.setMaxSuggestions(maxSuggestions);
}
+ //init INCLUDE_SIMILAR_SCORE
+ value = configuration.get(INCLUDE_SIMILAR_SCORE);
+ if(value instanceof Boolean){
+ linkerConfig.setIncludeSuggestionsWithSimilarScore((Boolean)value);
+ } else if(value != null){
+ linkerConfig.setIncludeSuggestionsWithSimilarScore(Boolean.parseBoolean(value.toString()));
+ }
//init MIN_FOUND_TOKENS
value = configuration.get(MIN_FOUND_TOKENS);
@@ -1047,6 +1065,18 @@ public class EntityLinkerConfig {
public int getMaxSuggestions() {
return maxSuggestions;
}
+
+ public boolean isIncludeSuggestionsWithSimilarScore(){
+ return includeSuggestionsWithSimilarScore;
+ }
+ public void setIncludeSuggestionsWithSimilarScore(Boolean state){
+ if(state == null){
+ includeSuggestionsWithSimilarScore = DEFAULT_INCLUDE_SIMILAR_SCORE;
+ } else {
+ includeSuggestionsWithSimilarScore = state;
+ }
+ }
+
/**
* Setter for the minimum number of Tokens (of the content) that MUST match
* with a {@link EntitySearcher#getNameField() label} of a
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Mon Feb 24 05:12:03 2014
@@ -54,6 +54,13 @@ public class LanguageProcessingConfig im
EnumSet.of(LexicalCategory.Noun, LexicalCategory.Quantifier,LexicalCategory.Residual);
/**
+ * The default set of {@link Pos} used to match (and search) for Entities <p>
+ * Matched Tokens are not used for linking, but are considered when matching
+ * label tokens of Entities with the Text.
+ */
+ public static final Set<Pos> DEFAULT_MATCHED_POS = EnumSet.of(Pos.Gerund);
+
+ /**
* The default set of {@link Pos} types that are used to lookup (link) Entities.
* By defualt only {@link Pos#ProperNoun}s and two
* {@link LexicalCategory#Residual} acronyms and
@@ -139,6 +146,10 @@ public class LanguageProcessingConfig im
private Set<LexicalCategory> matchedLexicalCategories = DEFAULT_MATCHED_LEXICAL_CATEGORIES;
+ private Set<Pos> matchedPos = DEFAULT_MATCHED_POS;
+
+ private Set<String> matchedPosTags = Collections.emptySet();
+
/**
* The linked {@link Pos} categories
*/
@@ -245,6 +256,42 @@ public class LanguageProcessingConfig im
}
}
/**
+ * Getter for the set of {@link Pos} tags used to match label tokens of
+ * suggested Entities
+ * @return the set of {@link Pos} tags used for matching
+ */
+ public Set<Pos> getMatchedPos(){
+ return matchedPos;
+ }
+ /**
+ * Setter for the matched {@link Pos} tags
+ * @param pos the set or <code>null</code>
+ * to set the {@link #DEFAULT_MATCHED_POS}
+ */
+ public void setMatchedPos(Set<Pos> pos) {
+ if(pos == null){
+ this.matchedPos = DEFAULT_MATCHED_POS;
+ } else {
+ this.matchedPos = EnumSet.noneOf(Pos.class);
+ this.matchedPos.addAll(pos);
+ }
+ }
+ public Set<String> getMatchedPosTags(){
+ return matchedPosTags;
+ }
+
+ public void setMatchedPosTags(Set<String> matchedPosTags){
+ if(matchedPosTags == null){
+ this.matchedPosTags = Collections.emptySet();
+ } else if(matchedPosTags.contains(null)){
+ throw new IllegalArgumentException("The parsed set with matched POS tags MUST NOT contain the NULL element!");
+ } else {
+ this.matchedPosTags = matchedPosTags;
+ }
+
+ }
+
+ /**
* The set of tags used for linking. This is useful if the string tags
* used by the POS tagger are not mapped to {@link LexicalCategory} nor
* {@link Pos} enum members.
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Mon Feb 24 05:12:03 2014
@@ -54,6 +54,10 @@ public class ChunkData {
/** the end token index relative to the current section (sentence) */
int endToken;
/**
+ * If this chunk has a linkable token
+ */
+ boolean hasLinkable = false;
+ /**
* The number of matchable Tokens enclosed by this Chunk
*/
int matchableCount;
@@ -129,6 +133,13 @@ public class ChunkData {
public boolean isNamedEntity() {
return isNamedEntity;
}
+ /**
+ * If this chunk covers a linkable token
+ * @return
+ */
+ public boolean hasLinkable(){
+ return hasLinkable;
+ }
/**
* Getter for the number of matchable tokens contained in this chunk
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Mon Feb 24 05:12:03 2014
@@ -315,15 +315,28 @@ public class EntityLinker {
log.warn(" currnet ranking : {}",suggestions);
log.warn(" ... this will result in worng confidence values relative to the best match");
}
+ int maxSuggestions = linkerConfig.getMaxSuggestions();
+ if(suggestions.size() > maxSuggestions &&
+ linkerConfig.isIncludeSuggestionsWithSimilarScore()){
+ //include suggestions with similar score
+ double minIncludeScore = suggestions.get(maxSuggestions).getScore();
+ int numInclude = maxSuggestions + 1; //the next element
+ double actScore;
+ do {
+ actScore = suggestions.get(numInclude).getScore();
+ numInclude++; //increase for the next iteration
+ } while(numInclude < suggestions.size() && actScore >= minIncludeScore);
+ maxSuggestions = numInclude - 1;
+ }
+ //remove all suggestions > maxSuggestions
+ if(suggestions.size() > maxSuggestions){
+ suggestions.subList(maxSuggestions,suggestions.size()).clear();
+ }
//adapt equals rankings based on the entity rank
if(linkerConfig.isRankEqualScoresBasedOnEntityRankings()){
adaptScoresForEntityRankings(suggestions);
adaptScoresForEntityRankings(partialMatches);
}
- //remove all suggestions > config.maxSuggestions
- if(suggestions.size() > linkerConfig.getMaxSuggestions()){
- suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
- }
if(log.isDebugEnabled()){
log.debug(" >> Suggestions:");
int i=0;
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Mon Feb 24 05:12:03 2014
@@ -153,6 +153,7 @@ public class SectionData {
while(activeChunkIt.hasNext()){
ChunkData activeChunk = activeChunkIt.next();
if (tokenData.isLinkable){
+ activeChunk.hasLinkable = true;
//ignore matchableCount in Chunks with linkable Tokens
activeChunk.matchableCount = -10; //by setting the count to -10
} else if(tokenData.isMatchable){
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java Mon Feb 24 05:12:03 2014
@@ -163,8 +163,9 @@ public class TokenData {
for(Value<PosTag> posAnnotation : posAnnotations){
PosTag posTag = posAnnotation.value();
if(posTag.isMapped()){
- if(!Collections.disjoint(tpc.getMatchedLexicalCategories(),
- posTag.getCategories())){
+ if((!Collections.disjoint(tpc.getMatchedLexicalCategories(), posTag.getCategories())) ||
+ (!Collections.disjoint(tpc.getMatchedPos(), posTag.getPosHierarchy())) ||
+ tpc.getMatchedPosTags().contains(posTag.getTag())){
if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
//override selectedPosTag if present
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java Mon Feb 24 05:12:03 2014
@@ -24,6 +24,7 @@ import java.lang.ref.WeakReference;
import java.security.AccessController;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
+import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.io.FileUtils;
@@ -176,6 +177,7 @@ public class CorpusInfo {
if(corpus != null){
//on first usage replace a WeakReference with a SoftReference
if(taggerCorpusRef instanceof WeakReference<?>){
+ log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
taggerCorpusRef.clear();
taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
}
@@ -183,6 +185,7 @@ public class CorpusInfo {
taggerCorpusRef = null; //reset to null as the reference was taken
}
if(corpus == null) {
+ log.info(" ... load FST corpus {}",fst);
try { //STANBOL-1177: load FST models in AccessController.doPrivileged(..)
corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
public TaggerFstCorpus run() throws IOException {
@@ -194,9 +197,17 @@ public class CorpusInfo {
//I need to set fstDate here, because I can not
//access lastModified() outside doPrivileged
fstDate = new Date(fst.lastModified());
+ if(log.isInfoEnabled()){
+ log.info(" ... loaded FST (date: {})",
+ SimpleDateFormat.getDateTimeInstance().format(fstDate));
+ }
+ } else {
+ log.warn(" ... no corpus loaded from {}",fst);
}
return corpus;
} else {
+ log.warn(" ... unable to load FST from {} (exists: {}, fileError {})",
+ new Object[]{fst, fst.exists(),fstFileError});
return null;
}
}
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Mon Feb 24 05:12:03 2014
@@ -294,12 +294,25 @@ public class FstLinkingEngine implements
} else if(suggestions.size() > 1){ //if we have multiple suggestions
//sort based on score
Collections.sort(suggestions, Match.SCORE_COMPARATOR);
+ int maxSuggestions = elConfig.getMaxSuggestions();
+ if(suggestions.size() > maxSuggestions &&
+ elConfig.isIncludeSuggestionsWithSimilarScore()){
+ //include suggestions with similar score
+ double minIncludeScore = suggestions.get(maxSuggestions).getScore();
+ int numInclude = maxSuggestions + 1; //the next element
+ double actScore;
+ do {
+ actScore = suggestions.get(numInclude).getScore();
+ numInclude++; //increase for the next iteration
+ } while(numInclude < suggestions.size() && actScore >= minIncludeScore);
+ maxSuggestions = numInclude - 1;
+ }
+ //remove all suggestions > maxSuggestions
+ if(suggestions.size() > maxSuggestions){
+ suggestions.subList(maxSuggestions,suggestions.size()).clear();
+ }
//adapt score based on entity ranking
adaptScoresForEntityRankings(suggestions);
- //cut the list on the maximum nuber of suggestions
- if(suggestions.size() > elConfig.getMaxSuggestions()){
- suggestions = suggestions.subList(0, elConfig.getMaxSuggestions());
- }
}
if(log.isTraceEnabled()){ //log the suggestion information
log.trace("Suggestions:");
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Mon Feb 24 05:12:03 2014
@@ -18,9 +18,11 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.CASE_SENSITIVE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_INCLUDE_SIMILAR_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MATCHING_LANGUAGE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_SUGGESTIONS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.INCLUDE_SIMILAR_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.SUGGESTIONS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.TYPE_MAPPINGS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
@@ -154,6 +156,7 @@ import com.google.common.util.concurrent
@Property(name=FstLinkingEngineComponent.ENTITY_CACHE_SIZE,
intValue=FstLinkingEngineComponent.DEFAULT_ENTITY_CACHE_SIZE),
@Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
+ @Property(name=INCLUDE_SIMILAR_SCORE, boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
@Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
@Property(name=PROCESSED_LANGUAGES, cardinality=Integer.MAX_VALUE,
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java Mon Feb 24 05:12:03 2014
@@ -228,13 +228,34 @@ public final class LinkableTokenFilter e
}
first = false;
if(token.isLinkable){
+ log.trace(" + lookup because {} is linkable", token);
lookup = true;
} else if (token.isMatchable){
lastMatchable = token.index;
lastIndex = lastMatchable;
- } //else if(token.hasAlphaNumeric){
- // lastIndex = token.index;
- //}
+ }
+ //special rules for processable chunks (typically noun phrases)
+ //accept all tokens in processable chunks with a linkable or
+ //multiple matchable tokens.
+ if(!lookup && (!lpc.isIgnoreChunks()) && token.inChunk != null
+ && token.inChunk.isProcessable){
+ if(token.inChunk.isNamedEntity()){
+ if(log.isTraceEnabled()){
+ log.trace(" + lookup because {} is part of Named Entity '{}'",
+ token.token, token.inChunk.chunk.getSpan());
+ }
+ lookup = true;
+ }
+ if(token.inChunk.hasLinkable() ||
+ (lpc.isLinkMultiMatchableTokensInChunk() &&
+ token.inChunk.getMatchableCount() > 1)){
+ if(log.isTraceEnabled()){
+ log.trace(" + lookup because {} is part of a linkable chunk '{}'",
+ token.token, token.inChunk.chunk.getSpan());
+ }
+ lookup = true;
+ }
+ }
}
//lookahead
if(!lookup && lastIndex >= 0 && sectionData != null){
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java Mon Feb 24 05:12:03 2014
@@ -126,6 +126,10 @@ public class Match {
public void updateScore(double score) {
this.score = score;
}
+ /**
+ * The score
+ * @return the score
+ */
public double getScore() {
return score;
}
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Feb 24 05:12:03 2014
@@ -102,6 +102,11 @@ The EntityCache is a LRU cache for such
enhancer.engines.linking.suggestions.name=Max Suggestions
enhancer.engines.linking.suggestions.description=The maximum number of suggestions
+enhancer.engines.linking.includeSimilarScore.name=Include Similar Score Suggestions
+enhancer.engines.linking.includeSimilarScore.description= If enabled all suggestions \
+with a similar score as the last one will be included in the result. Enabling this \
+will result in more entities being suggested than configured by 'Max Suggestions'
+
enhancer.engines.linking.minSearchTokenLength.name=Min Token Length
enhancer.engines.linking.minSearchTokenLength.description=The minimum \
length of Tokens used to lookup Entities within the Controlled Vocabulary. This parameter is ignored \
Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java Mon Feb 24 05:12:03 2014
@@ -24,6 +24,7 @@ import java.util.Collections;
import java.util.List;
import java.util.Set;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition.TokenTypeDefinition;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
@@ -105,7 +106,7 @@ public class PhraseBuilder {
public void nextSection(Section section){
buildPhrase(null);
- log.trace("-- next {} --", section);
+ log.debug("-- next {} --", section);
}
@@ -115,12 +116,14 @@ public class PhraseBuilder {
phraseType.getRequiredType());
if(states[0]){
current.add(token);
- if(log.isTraceEnabled()) {
- log.trace("-- {} phrase start --", phraseType.getPhraseType().name());
- log.trace(" {}. {} {}", new Object[]{ current.size(), token,
+ if(log.isDebugEnabled()) {
+ log.debug("-- {} phrase start --", phraseType.getPhraseType().name());
+ log.debug(" {}. {} {}", new Object[]{ current.size(), token,
logPosCategories(token)});
}
valid = states[1];
+ } else if(log.isTraceEnabled()){
+ log.trace(" - {} {}", token, logPosCategories(token));
}
}
@@ -135,8 +138,8 @@ public class PhraseBuilder {
}
if(states[0]){
current.add(token);
- if(log.isTraceEnabled()) {
- log.trace(" {}. {} {}", new Object[]{ current.size(), token,
+ if(log.isDebugEnabled()) {
+ log.debug(" {}. {} {}", new Object[]{ current.size(), token,
logPosCategories(token)});
}
}
@@ -163,17 +166,17 @@ public class PhraseBuilder {
Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken);
//TODO: add support for confidence
chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag));
- if(log.isTraceEnabled()){
- log.trace(" << add {} phrase {} '{}'", new Object[]{
+ if(log.isDebugEnabled()){
+ log.debug(" << add {} phrase {} '{}'", new Object[]{
phraseType.getPhraseType().name(), chunk,chunk.getSpan()});
}
- } else if(log.isTraceEnabled()){
- log.trace(" >> ignore {} phrase with single {} ",
+ } else if(log.isDebugEnabled()){
+ log.debug(" >> ignore {} phrase with single {} ",
phraseType.getPhraseType().name() ,
current.get(0));
}
- } else if(!current.isEmpty() && log.isTraceEnabled()){
- log.trace(" << ignore invalid {} phrase [{},{}]", new Object[]{
+ } else if(!current.isEmpty() && log.isDebugEnabled()){
+ log.debug(" << ignore invalid {} phrase [{},{}]", new Object[]{
phraseType.getPhraseType().name(), current.get(0).getStart(),
current.get(current.size()-1).getEnd()});
}
@@ -193,12 +196,12 @@ public class PhraseBuilder {
* is suitable for {@link PhraseTypeDefinition#getStartType()} and
* {@link PhraseTypeDefinition#getRequiredType()}.
* @param token the Token
- * @param categories the list of categories to check
+ * @param ttd the token type definitions to check
* @return if the sum of matching annotations compared to the score of all
* POS annotations is higher or equals the configured {@link #minPosSocre}.
* For each parsed categories set a boolean state is returned.
*/
- private boolean[] checkCategories(Token token, Set<LexicalCategory>...categories) {
+ private boolean[] checkCategories(Token token, TokenTypeDefinition...ttd) {
//there are different ways NLP frameworks do assign scores. For some the
//sum of all categories would sum up to 1.0, but as only the top three
//categories are included the sum would be < 1
@@ -210,22 +213,28 @@ public class PhraseBuilder {
//Match.max(1.0,sumScore).
//POS tags without score are assigned a #DEFAULT_SCORE. If not a single
//POS tag with a score is present the sumScore is NOT normalized to 1.0
+ log.trace("> check Categories for {}",token);
+ if(log.isTraceEnabled()){
+ for(int i = 0; i < ttd.length; i++){
+ log.trace( "Cat {}: {}",i,ttd[i]);
+ }
+ }
boolean scorePresent = false;
double sumScore = 0;
- double[] matchScores = new double[categories.length];
+ double[] matchScores = new double[ttd.length];
for(Value<PosTag> pos : token.getAnnotations(POS_ANNOTATION)){
+ log.trace(" - {}",pos);
double score = pos.probability();
if(score == Value.UNKNOWN_PROBABILITY){
score = DEFAULT_SCORE;
} else {
scorePresent = true;
}
- sumScore = sumScore + pos.probability();
- Set<LexicalCategory> tokenCategories = pos.value().getCategories();
- for(int i = 0; i < categories.length; i++){
- Set<LexicalCategory> category = categories[i];
- if(!Collections.disjoint(tokenCategories, category)){
- matchScores[i] = matchScores[i] + pos.probability();
+ sumScore = sumScore + score;
+ for(int i = 0; i < ttd.length; i++){
+ if(ttd[i].matches(pos.value())){
+ log.trace(" matches Category {} with score {}",i,score);
+ matchScores[i] = matchScores[i] + score;
}
}
}
Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java Mon Feb 24 05:12:03 2014
@@ -18,10 +18,13 @@ package org.apache.stanbol.enhancer.engi
import java.util.Collections;
import java.util.EnumSet;
+import java.util.HashSet;
import java.util.Set;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
/**
* Definition of a phrase type<p>
@@ -47,72 +50,24 @@ public class PhraseTypeDefinition {
protected final LexicalCategory phraseType;
- private final Set<LexicalCategory> startTypes;
- protected final Set<LexicalCategory> readOnlyStartTypes;
- private final Set<LexicalCategory> prefixTypes;
- protected final Set<LexicalCategory> readOnlyPrefixTypes;
- private final Set<LexicalCategory> continuationTypes;
- protected final Set<LexicalCategory> readOnlyContinuationTypes;
- private final Set<LexicalCategory> requiredTypes;
- protected final Set<LexicalCategory> readOnlyRequiredTypes;
- private final Set<LexicalCategory> endTypes;
- protected final Set<LexicalCategory> readOnlyEndTypes;
+ private final TokenTypeDefinition startTypeDefinition;
+ private final TokenTypeDefinition prefixTypeDefinition;
+ private final TokenTypeDefinition continuationTypeDefinition;
+ private final TokenTypeDefinition requiredTypeDefinition;
+ private final TokenTypeDefinition endTypeDefinition;
public PhraseTypeDefinition(LexicalCategory phraseType) {
if(phraseType == null){
throw new IllegalArgumentException("The parsed PhraseType MUST NOT be NULL!");
}
this.phraseType = phraseType;
- startTypes = EnumSet.of(phraseType);
- readOnlyStartTypes = Collections.unmodifiableSet(startTypes);
- prefixTypes = EnumSet.of(phraseType);
- readOnlyPrefixTypes = Collections.unmodifiableSet(prefixTypes);
- continuationTypes = EnumSet.of(phraseType);
- readOnlyContinuationTypes = Collections.unmodifiableSet(continuationTypes);
- requiredTypes = EnumSet.of(phraseType);
- readOnlyRequiredTypes = Collections.unmodifiableSet(requiredTypes);
- endTypes = EnumSet.of(phraseType);
- readOnlyEndTypes = Collections.unmodifiableSet(startTypes);
+ startTypeDefinition = new TokenTypeDefinition(phraseType);
+ prefixTypeDefinition = new TokenTypeDefinition(phraseType);
+ continuationTypeDefinition = new TokenTypeDefinition(phraseType);
+ requiredTypeDefinition = new TokenTypeDefinition(phraseType);
+ endTypeDefinition = new TokenTypeDefinition(phraseType);
}
- public boolean addStartType(LexicalCategory...types){
- return add(startTypes,types);
- }
-
- public boolean addPrefixType(LexicalCategory...types){
- return add(prefixTypes,types);
- }
-
- public boolean addContinuationType(LexicalCategory...types){
- return add(continuationTypes,types);
- }
-
- public boolean addRequiredType(LexicalCategory...types){
- return add(requiredTypes,types);
- }
- public boolean addEndType(LexicalCategory...types){
- return add(endTypes,types);
- }
-
- public boolean removeStartType(LexicalCategory...types){
- return remove(startTypes,types);
- }
-
- public boolean removePrefixType(LexicalCategory...types){
- return remove(prefixTypes,types);
- }
-
- public boolean removeContinuationType(LexicalCategory...types){
- return remove(continuationTypes,types);
- }
-
- public boolean removeRequiredType(LexicalCategory...types){
- return remove(requiredTypes,types);
- }
-
- public boolean removeEndType(LexicalCategory...types){
- return remove(endTypes,types);
- }
/**
* Getter for the type of this phrase definition
* @return
@@ -126,8 +81,8 @@ public class PhraseTypeDefinition {
* @return the read only set with {@link LexicalCategory LexicalCategories}
* that can start a phrase of that type
*/
- public Set<LexicalCategory> getStartType(){
- return readOnlyStartTypes;
+ public TokenTypeDefinition getStartType(){
+ return startTypeDefinition;
}
/**
* Getter for the read only set with the prefix types
@@ -138,8 +93,8 @@ public class PhraseTypeDefinition {
* considered in prefixes (e.g. "A nice weekend") but excluded after the
* first noun (e.g. "the trip last week").
*/
- public Set<LexicalCategory> getPrefixType(){
- return readOnlyPrefixTypes;
+ public TokenTypeDefinition getPrefixType(){
+ return prefixTypeDefinition;
}
/**
@@ -151,8 +106,8 @@ public class PhraseTypeDefinition {
* considered in prefixes (e.g. "A nice weekend") but excluded after the
* first noun (e.g. "the trip last week").
*/
- public Set<LexicalCategory> getContinuationType(){
- return readOnlyContinuationTypes;
+ public TokenTypeDefinition getContinuationType(){
+ return continuationTypeDefinition;
}
/**
@@ -160,8 +115,8 @@ public class PhraseTypeDefinition {
* @return the read only set with {@link LexicalCategory LexicalCategories}
* that MUST occur within a phrase of that type
*/
- public Set<LexicalCategory> getRequiredType(){
- return readOnlyRequiredTypes;
+ public TokenTypeDefinition getRequiredType(){
+ return requiredTypeDefinition;
}
/**
@@ -169,40 +124,263 @@ public class PhraseTypeDefinition {
* @return the read only set with {@link LexicalCategory LexicalCategories}
* that can end a phrase of that type
*/
- public Set<LexicalCategory> getEndType(){
- return readOnlyEndTypes;
+ public TokenTypeDefinition getEndType(){
+ return endTypeDefinition;
}
-
- private boolean add(Set<LexicalCategory> set, LexicalCategory...types){
- boolean changed = false;
- if(types != null){
- for(LexicalCategory type : types){
- if(type != null){
- if(set.add(type)){
- changed = true;
+
+ @Override
+ public String toString() {
+ return phraseType.name();
+ }
+
+ public static class TokenTypeDefinition {
+
+ private final Set<LexicalCategory> categories = EnumSet.noneOf(LexicalCategory.class);
+ private Set<Pos> posTags = EnumSet.noneOf(Pos.class);
+ private Set<Pos> excludedPosTags = EnumSet.noneOf(Pos.class);
+ private Set<String> tags = new HashSet<String>();
+
+ /**
+ * Used by the constructor of the {@link PhraseTypeDefinition} class
+ * @param lc
+ */
+ private TokenTypeDefinition(LexicalCategory lc){
+ this(Collections.singleton(lc),null);
+ }
+
+ public TokenTypeDefinition(Set<LexicalCategory> categories, Set<Pos> posTags, String...tags) {
+ if(categories != null){
+ for(LexicalCategory lc : categories){
+ if(lc != null){
+ this.categories.add(lc);
+ }
+ }
+ }
+ if(posTags != null){
+ for(Pos pos : posTags){
+ if(pos != null){
+ this.posTags.add(pos);
+ }
+ }
+ }
+ if(tags != null){
+ for(String tag : tags){
+ if(tag != null){
+ this.tags.add(tag);
}
}
}
}
- return changed;
- }
-
- private boolean remove(Set<LexicalCategory> set, LexicalCategory...types){
- boolean changed = false;
- if(types != null){
- for(LexicalCategory type : types){
- if(type != null){
- if(set.remove(type)){
- changed = true;
+ /**
+ * Read-/writeable set of {@link LexicalCategory LexicalCategories}
+ * @return the set of lexical categories
+ */
+ public Set<LexicalCategory> getCategories() {
+ return categories;
+ }
+ /**
+ * Adds the parsed {@link LexicalCategory LexicalCategories}
+ * @param categories the LexicalCategories
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean addCategories(LexicalCategory...categories){
+ return add(this.categories, categories);
+ }
+
+ /**
+ * Removes the parsed {@link LexicalCategory LexicalCategories}
+ * @param categories the LexicalCategories
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean removeCategories(LexicalCategory...categories){
+ return remove(this.categories, categories);
+ }
+
+ /**
+ * Read-/writeable set of {@link Pos} tags
+ * @return the set of POS tags
+ */
+ public Set<Pos> getPosTags() {
+ return posTags;
+ }
+
+ /**
+ * Adds the parsed {@link Pos} tags
+ * @param pos the {@link Pos} tags
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean addPosTags(Pos...pos){
+ return add(this.posTags, pos);
+ }
+
+ /**
+ * Removes the parsed {@link Pos} tags
+ * @param pos the {@link Pos} tags
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean removePosTags(Pos...pos){
+ return remove(this.posTags, pos);
+ }
+
+ /**
+ * Read-/writeable set of excluded {@link Pos} tags. This allows one to
+ * include a {@link LexicalCategory} while excluding specific
+ * {@link Pos} members of that category.
+ * @return the set of excluded POS tags
+ */
+ public Set<Pos> getExcludedPosTags() {
+ return excludedPosTags;
+ }
+
+ /**
+ * Adds the parsed {@link Pos} tags to the set of excluded {@link Pos} tags
+ * @param pos the {@link Pos} tags
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean addExcludedPosTags(Pos...pos){
+ return add(this.excludedPosTags, pos);
+ }
+
+ /**
+ * Removes the parsed {@link Pos} tags from the set of excluded {@link Pos} tags
+ * @param pos the {@link Pos} tags
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean removeExcludedPosTags(Pos...pos){
+ return remove(this.excludedPosTags, pos);
+ }
+ /**
+ * Read-/writeable set of string tags (as provided by the POS tagger)
+ * @return the set of String tags
+ */
+ public Set<String> getTags() {
+ return tags;
+ }
+ /**
+ * Adds the parsed tags
+ * @param tag the tags
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean addTags(String...tag){
+ return add(this.tags, tag);
+ }
+
+ /**
+ * Removes the parsed tags
+ * @param tag the tags
+ * @return if the {@link TokenTypeDefinition} was updated by this operation
+ */
+ public boolean removeTags(String...tag){
+ return remove(this.tags, tag);
+ }
+
+ /**
+ * Checks if a posTag matches against this TokenTypeDefinition
+ * @param posTag the posTag to check
+ * @return <code>true</code> in case of a match. Otherwise <code>false</code>
+ * @throws NullPointerException if the parsed posTag is <code>null</code>
+ */
+ public boolean matches(PosTag posTag){
+ //check against included categories, posTags and tags
+ boolean matches =
+ (!Collections.disjoint(posTag.getCategories(), categories)) ||
+ (!Collections.disjoint(posTag.getPosHierarchy(), posTags)) ||
+ tags.contains(posTag.getTag());
+ //if there is a match we need still to check for excluded POS tags
+ return matches ? Collections.disjoint(posTag.getPosHierarchy(),excludedPosTags) :
+ false;
+ }
+
+ private <T> boolean add(Set<T> set, T...types){
+ boolean changed = false;
+ if(types != null){
+ for(T type : types){
+ if(type != null){
+ if(set.add(type)){
+ changed = true;
+ }
}
}
}
+ return changed;
+ }
+
+ private <T> boolean remove(Set<T> set, T...types){
+ boolean changed = false;
+ if(types != null){
+ for(T type : types){
+ if(type != null){
+ if(set.remove(type)){
+ changed = true;
+ }
+ }
+ }
+ }
+ return changed;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ if(!categories.isEmpty()){
+ sb.append("Cat: ");
+ boolean first = true;
+ for(LexicalCategory lc : categories){
+ if(first){
+ first = false;
+ } else {
+ sb.append(", ");
+ }
+ sb.append(lc.name());
+ }
+ }
+ if(!posTags.isEmpty()){
+ if(sb.length() > 0){
+ sb.append(" | ");
+ }
+ sb.append("Pos: ");
+ boolean first = true;
+ for(Pos pos : posTags){
+ if(first){
+ first = false;
+ } else {
+ sb.append(", ");
+ }
+ sb.append(pos.name());
+ }
+ }
+ if(!tags.isEmpty()){
+ if(sb.length() > 0){
+ sb.append(" | ");
+ }
+ sb.append("Tags: ");
+ boolean first = true;
+ for(String tag : tags){
+ if(first){
+ first = false;
+ } else {
+ sb.append(", ");
+ }
+ sb.append(tag);
+ }
+ }
+ if(!excludedPosTags.isEmpty()){
+ if(sb.length() > 0){
+ sb.append(" | ");
+ }
+ sb.append("Excluded: ");
+ boolean first = true;
+ for(Pos pos : excludedPosTags){
+ if(first){
+ first = false;
+ } else {
+ sb.append(", ");
+ }
+ sb.append(pos.name());
+ }
+ }
+ return sb.toString();
}
- return changed;
}
- @Override
- public String toString() {
- return phraseType.name();
- }
}
Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java Mon Feb 24 05:12:03 2014
@@ -52,6 +52,7 @@ import org.apache.stanbol.enhancer.nlp.m
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -126,22 +127,29 @@ public class PosChunkerEngine extends Ab
//TODO: make configurable
static {
PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
+ //NOTE: Pos.Acronym, Pos.Abbreviation, Pos.Foreign are also considered as
+ // nouns by this definition.
+ nounPD.getRequiredType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
//start types noun (automatically included) pronoun or determiners, adjectives
- nounPD.addStartType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ nounPD.getStartType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ nounPD.getStartType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
//prefix types are the same as start types (e.g. "the nice trip")
- nounPD.addPrefixType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ nounPD.getPrefixType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ nounPD.getPrefixType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
//continuation types are nouns and punctations.
//NOTE: Adverbs are excluded to avoid phrases like "the nice trip last week"
- nounPD.addContinuationType(LexicalCategory.Punctuation);
+ nounPD.getContinuationType().addCategories(LexicalCategory.Punctuation);
+ nounPD.getContinuationType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
//end types are the same as start terms
- nounPD.addEndType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ nounPD.getEndType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ nounPD.getEndType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
//and required types do include a Noun (what is actually included by default)
NOUN_PHRASE_TYPE = nounPD;
PhraseTypeDefinition verbPD = new PhraseTypeDefinition(LexicalCategory.Verb);
- verbPD.addStartType(LexicalCategory.Adverb);
- verbPD.addContinuationType(LexicalCategory.Adverb,LexicalCategory.Punctuation);
- verbPD.addEndType(LexicalCategory.Adverb);
+ verbPD.getStartType().addCategories(LexicalCategory.Adverb);
+ verbPD.getContinuationType().addCategories(LexicalCategory.Adverb,LexicalCategory.Punctuation);
+ verbPD.getEndType().addCategories(LexicalCategory.Adverb);
//and required types do include a Verbs (what is actually included by default)
VERB_PHRASE_TYPE = verbPD;
}