You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/10 08:29:23 UTC
svn commit: r1557044 - in
/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking:
FstLinkingEngine.java FstLinkingEngineComponent.java LinkableTokenFilter.java
Author: rwesten
Date: Fri Jan 10 07:29:22 2014
New Revision: 1557044
URL: http://svn.apache.org/r1557044
Log:
STANBOL-1252: merged NUM_TOKEN_FOUND feature of Lucene FST Linking Engine to 0.12 branch
Modified:
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1557044&r1=1557043&r2=1557044&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Fri Jan 10 07:29:22 2014
@@ -370,13 +370,13 @@ public class FstLinkingEngine implements
new CharSequenceReader(at.getText()));
LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream,
at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()),
- elConfig.getMinChunkMatchScore());
+ elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
//we use two TagClusterReducer implementations.
// (1) the linkableTokenFilter filters all tags that do not overlap any
// linkable Token
// (2) the LONGEST_DOMINANT_RIGHT reducer (TODO: make configurable)
TagClusterReducer reducer = new ChainedTagClusterReducer(
- linkableTokenFilter,TagClusterReducer.LONGEST_DOMINANT_RIGHT);
+ TagClusterReducer.LONGEST_DOMINANT_RIGHT, linkableTokenFilter);
final long[] time = new long[]{0};
new Tagger(corpus.getFst(), linkableTokenFilter, reducer,session.isSkipAltTokens()) {
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1557044&r1=1557043&r2=1557044&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Fri Jan 10 07:29:22 2014
@@ -213,6 +213,13 @@ public class FstLinkingEngineComponent {
* The default size of the Entity Cache is set to 65k entities.
*/
public static final int DEFAULT_ENTITY_CACHE_SIZE = 65536;
+
+ /**
+ * Changed default for the {@link EntityLinkerConfig#MIN_FOUND_TOKENS} property.
+ * This Engine uses <code>2</code> as default, while the {@link EntityLinkerConfig}
+ * currently sets the default to <code>1</code>.
+ */
+ private static final Integer FST_DEFAULT_MIN_FOUND_TOKENS = 2;
private final Logger log = LoggerFactory.getLogger(FstLinkingEngineComponent.class);
/**
@@ -352,7 +359,13 @@ public class FstLinkingEngineComponent {
//(1) parse the TextProcessing configuration
//TODO: decide if we should use the TextProcessingConfig for this engine
textProcessingConfig = TextProcessingConfig.createInstance(properties);
+ //change default for EntityLinkerConfig.MIN_FOUND_TOKENS
+ value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
entityLinkerConfig = EntityLinkerConfig.createInstance(properties, prefixService);
+ if(value == null){ //no MIN_FOUND_TOKENS config present
+ //manually set the default to the value used by this engine
+ entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
+ }
//(2) parse the configured IndexReference
value = properties.get(SOLR_CORE);
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1557044&r1=1557043&r2=1557044&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java Fri Jan 10 07:29:22 2014
@@ -164,9 +164,14 @@ public final class LinkableTokenFilter e
* {@link Chunk} so that it is not omitted.
*/
private double minChunkMatchScore;
+ /**
+ * The minimum number of matched (matchable) Tokens required for an Entity
+ * to be considered. Only used within processable chunks.
+ */
+ private int minFoundTokens;
protected LinkableTokenFilter(TokenStream input, AnalysedText at,
- String lang, LanguageProcessingConfig lpc, double minChunkMatchScore) {
+ String lang, LanguageProcessingConfig lpc, double minChunkMatchScore, int minFoundTokens) {
super(input);
//STANBOL-1177: add attributes in doPrivileged to avoid
//AccessControlException: access denied ("java.lang.RuntimePermission" "getClassLoader")
@@ -188,6 +193,7 @@ public final class LinkableTokenFilter e
this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
UNICASE_SCRIPT_LANUAGES.contains(lang);
this.minChunkMatchScore = minChunkMatchScore;
+ this.minFoundTokens = minFoundTokens;
}
@Override
@@ -362,13 +368,13 @@ public final class LinkableTokenFilter e
tag.removeLL(); //remove the tag from the cluster
if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start, end);
- log.trace(" > reduce tag {}", tagSequence);
+ log.trace(" > reduce tag {} - no overlapp with linkable token", tagSequence);
}
} else { //if the tag overlaps a linkable token
TokenData linkableToken = linkableTokenContext.linkableToken;
List<TokenData> tokens = linkableTokenContext.context;
ChunkData cd = linkableToken.inChunk; //check if it maches > 50% of the chunk
- if(!lpc.isIgnoreChunks() && cd != null &&
+ if(!lpc.isIgnoreChunks() && cd != null &&
cd.isProcessable){
int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() :
start;
@@ -388,32 +394,32 @@ public final class LinkableTokenFilter e
}
//only accept tags where more than half of the matchable
//tokens in the Chunk are matched!
- if(((float)match/(float)num) < minChunkMatchScore){
+ if(((float)match/(float)num) < minChunkMatchScore &&
+ match < minFoundTokens){
tag.removeLL(); //ignore
if(log.isTraceEnabled()){
CharSequence text = at.getText();
- log.trace(" - reduce tag {}[{},{}] because it does only match "
+ log.trace(" - reduce tag {}[{},{}] - does only match "
+ "{} of {} of matchable Chunk {}[{},{}]",
new Object[]{text.subSequence(start, end), start, end, match,
num, text.subSequence(cstart, cend), cstart, cend});
}
} else if(log.isTraceEnabled()){
CharSequence text = at.getText();
- log.trace(" + keep tag {}[{},{}] matching {} of {} "
+ log.trace(" + keep tag {}[{},{}] - matches {} of {} "
+ "matchable Tokens for matchable Chunk {}[{},{}]",
new Object[]{text.subSequence(start, end), start, end, match,
num, text.subSequence(cstart, cend), cstart, cend});
}
} else if(log.isTraceEnabled()){
CharSequence text = at.getText();
- log.trace(" + keep tag {}[{},{}] for matchable Chunk {}[{},{}]",
+ log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]",
new Object[]{text.subSequence(start, end), start, end,
text.subSequence(cstart, cend), cstart, cend});
}
- }
- if(log.isTraceEnabled()){
+ } else if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start, end);
- log.trace(" + keep tag {}", tagSequence);
+ log.trace(" + keep tag {} - not in processable chunk", tagSequence);
}
}
}