You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/11/19 11:54:17 UTC
svn commit: r1543372 - in
/stanbol/trunk/enhancement-engines/entitylinking/engine/src:
main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/
main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/
test/java/org/apache/stanbol/en...
Author: rwesten
Date: Tue Nov 19 10:54:17 2013
New Revision: 1543372
URL: http://svn.apache.org/r1543372
Log:
implementation for STANBOL-1211: The ChunkData now provides information about matchable tokens; the EntityLinkerConfig allows configuring the minimum chunk match score; the EntityLinkingEngine supports chunks as described by STANBOL-1211
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1543372&r1=1543371&r2=1543372&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java Tue Nov 19 10:54:17 2013
@@ -37,6 +37,7 @@ import org.apache.stanbol.enhancer.engin
import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
@@ -168,6 +169,14 @@ public class EntityLinkerConfig {
*/
public static final String MIN_MATCH_FACTOR = "enhancer.engines.linking.minMatchScore";
/**
+ * The minimum fraction of the matchable {@link Token}s within a processable
+ * {@link Chunk} that an Entity must match. By {@link #DEFAULT_MIN_CHUNK_MATCH_SCORE default}
+ * this is set to <code>51%</code> to filter Entities that match only a single token
+ * within a NounPhrase of two words. This feature was introduced with
+ * <a href="https://issues.apache.org/jira/browse/STANBOL-1211">STANBOL-1211</a>
+ */
+ public static final String MIN_CHUNK_MATCH_SCORE = "enhancer.engines.linking.minChunkMatchScore";
+ /**
* The maximum number of {@link Token} used as search terms with the
* {@link EntitySearcher#lookup(String, Set, java.util.List, String[], Integer)}
* method
@@ -263,6 +272,13 @@ public class EntityLinkerConfig {
public static final double DEFAULT_MIN_TEXT_SCORE = 0.4;
public static final double DEFAULT_MIN_MATCH_SCORE = 0.3;
/**
+ * By default more than 50% of the matchable tokens of a processable chunk
+ * need to match so that an Entity is considered to be mentioned in the text
+ * (STANBOL-1211)
+ */
+ public static final double DEFAULT_MIN_CHUNK_MATCH_SCORE = 0.51;
+
+ /**
* Default mapping for Concept types to dc:type values added for
* TextAnnotations.
*/
@@ -449,6 +465,11 @@ public class EntityLinkerConfig {
private double minLabelScore = DEFAULT_MIN_LABEL_SCORE;
private double minTextScore = DEFAULT_MIN_TEXT_SCORE;
private double minMatchScore = DEFAULT_MIN_MATCH_SCORE;
+ /**
+ * The minimum score an entity needs to match matchable tokens within a
+ * chunk so that it is considered a mention (STANBOL-1211)
+ */
+ private double minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE;
private boolean rankEqualScoresBasedOnEntityRankings = DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS;
@@ -632,7 +653,25 @@ public class EntityLinkerConfig {
} catch (IllegalArgumentException e){
throw new ConfigurationException(MIN_MATCH_FACTOR, e.getMessage());
}
-
+
+ value = configuration.get(MIN_CHUNK_MATCH_SCORE);
+ Double minChunkMatchScore = null;
+ if(value instanceof Number){
+ minChunkMatchScore = Double.valueOf(((Number)value).doubleValue());
+ } else if(value != null){
+ try {
+ minChunkMatchScore = Double.valueOf(value.toString());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, "Parsed value '"
+ +value+"' is not an valid double!");
+ }
+ }
+ try {
+ linkerConfig.setMinChunkMatchScore(minChunkMatchScore);
+ } catch (IllegalArgumentException e){
+ throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, e.getMessage());
+ }
+
//init LEMMA_MATCHING_STATE
value = configuration.get(LEMMA_MATCHING_STATE);
if(value instanceof Boolean){
@@ -1085,14 +1124,15 @@ public class EntityLinkerConfig {
*/
public UriRef setTypeMapping(String conceptType, UriRef dcType){
if(dcType == null) {
- throw new IllegalArgumentException("The parsed dc:type URI MUST NOT be NULL!");
- }
- if(conceptType == null){ //handle setting of the default dc:type value
- UriRef oldDefault = getDefaultDcType();
- setDefaultDcType(dcType);
- return oldDefault;
+ return typeMappings.remove(conceptType == null ? null : new UriRef(conceptType));
+ } else {
+ if(conceptType == null){ //handle setting of the default dc:type value
+ UriRef oldDefault = getDefaultDcType();
+ setDefaultDcType(dcType);
+ return oldDefault;
+ }
+ return typeMappings.put(new UriRef(conceptType), dcType);
}
- return typeMappings.put(new UriRef(conceptType), dcType);
}
/**
@@ -1306,7 +1346,35 @@ public class EntityLinkerConfig {
} else {
minTextScore = score;
}
- }
+ }
+ /**
+ * Getter for the minimum amount of matchable {@link Token}s an Entity must match
+ * within a {@link Chunk} to be considered (see STANBOL-1211).<p>
+ * The default is <code>&gt;0.5</code> to omit matches for a single token
+ * in a chunk - typically a noun phrase - including two words.
+ * @return the minimum chunk match score.
+ */
+ public double getMinChunkMatchScore() {
+ return minChunkMatchScore;
+ }
+ /**
+ * Setter for the minimum amount of matchable {@link Token}s an Entity must match
+ * within a {@link Chunk} to be considered (see STANBOL-1211).<p>
+ * The default is <code>&gt;0.5</code> to omit matches for a single token
+ * in a chunk - typically a noun phrase - including two words.
+ * @param minChunkMatchScore the minimum chunk match score or <code>null</code>
+ * to reset to the default value
+ */
+ public void setMinChunkMatchScore(Double minChunkMatchScore) {
+ if(minChunkMatchScore == null){
+ this.minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE;
+ } else if(minChunkMatchScore < 0.0 || minChunkMatchScore > 1.0){
+ throw new IllegalArgumentException("The minChunkMatchScore MUST BE "
+ + "in the range [0..1] (parsed: "+minChunkMatchScore+")!");
+ } else {
+ this.minChunkMatchScore = minChunkMatchScore;
+ }
+ }
/**
* Getter for the minimum match Score of Entity labels against the
* Text.<p>
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1543372&r1=1543371&r2=1543372&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Tue Nov 19 10:54:17 2013
@@ -63,6 +63,23 @@ public class ChunkData {
*/
int matchableCount;
/**
+ * The start position of the first matchable {@link Token} within this
+ * chunk
+ */
+ int matchableStart = -1;
+ /**
+ * The start char offset of the first matchable {@link Token} within this chunk
+ */
+ int matchableStartCharIndex = -1;
+ /**
+ * The end position of the last matchable {@link Token} within this chunk
+ */
+ int matchableEnd = -1;
+ /**
+ * The end char offset of the last matchable {@link Token} within this chunk
+ */
+ int matchableEndCharIndex = -1;
+ /**
* constructs and initializes the meta data for the parsed {@link Chunk}
* @param chunk
*/
@@ -121,4 +138,37 @@ public class ChunkData {
public int getEndTokenIndex() {
return endToken;
}
+ /**
+ * The index of the first matchable Token within the {@link Chunk} or
+ * <code>-1</code> if none
+ * @return
+ */
+ public int getMatchableStart() {
+ return matchableStart;
+ }
+ /**
+ * The index of the last matchable Token within the {@link Chunk} or
+ * <code>-1</code> if none
+ * @return
+ */
+ public int getMatchableEnd() {
+ return matchableEnd;
+ }
+ /**
+ * The char index of the start character of the first matchable {@link Token}
+ * within the {@link Chunk} or <code>-1</code> if none.
+ * @return
+ */
+ public int getMatchableStartChar() {
+ return matchableStartCharIndex;
+ }
+ /**
+ * The char index of the end character of the last matchable {@link Token}
+ * within the {@link Chunk} or <code>-1</code> if none
+ * @return
+ */
+ public int getMatchableEndChar() {
+ return matchableEndCharIndex;
+ }
+
}
\ No newline at end of file
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1543372&r1=1543371&r2=1543372&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Tue Nov 19 10:54:17 2013
@@ -25,6 +25,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
@@ -35,6 +36,7 @@ import org.apache.clerezza.rdf.core.Trip
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.lang.LocaleUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
@@ -162,24 +164,19 @@ public class EntityLinker {
//Determine the range we are allowed to search for tokens
final int minIncludeIndex;
final int maxIndcludeIndex;
+ int consumedIndex = state.getConsumedIndex();
//NOTE: testing has shown that using Chunks to restrict search for
// additional matchable tokens does have an negative impact on
// recall. Because of that this restriction is for now deactivated
- //TODO: maybe make configurable via an own property
- boolean restrirctContextByChunks = textProcessingConfig.isIgnoreChunks();
- int consumedIndex = state.getConsumedIndex();
- if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() &&
- restrirctContextByChunks){
- minIncludeIndex = token.inChunk.getStartTokenIndex();
-// minIncludeIndex = Math.max(
-// state.getConsumedIndex()+1,
-// token.inChunk.getStartTokenIndex());
- maxIndcludeIndex = token.inChunk.getEndTokenIndex();
- } else {
+// if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks()){
+// minIncludeIndex = token.inChunk.getStartTokenIndex();
+// maxIndcludeIndex = token.inChunk.getEndTokenIndex();
+// log.debug(" - restrict context to chunk[{}, {}]",
+// minIncludeIndex, maxIndcludeIndex);
+// } else {
maxIndcludeIndex = state.getTokens().size() - 1;
-// minIncludeIndex = state.getConsumedIndex() + 1;
minIncludeIndex = 0;
- }
+// }
int prevIndex = token.index;
int pastIndex = token.index;
int pastNonMatchable = 0;
@@ -766,12 +763,19 @@ public class EntityLinker {
PlainLiteral label = labels.next();
numLabels++;
String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
+ String text = label.getLexicalForm();
+ //if case-insensitive matching ... compare lower case versions
+ if(!linkerConfig.isCaseSensitiveMatching()){
+ text = text.toLowerCase(Locale.ROOT);
+ }
if((lang == null && curLang == null) ||
(lang != null && curLang != null && lang.equalsIgnoreCase(curLang))){
- if(!matchedLabels.contains(label.getLexicalForm())){
+ if(!matchedLabels.contains(text)){
matchLabel(searchTokens, match, label);
- matchedLabels.add(label.getLexicalForm());
+ matchedLabels.add(text);
matchedLangLabel = true;
+ } else if(!matchedLangLabel){
+ matchedLangLabel = true; //found a equivalent label in the matchlang
}
} else if((lang == null && mainLang == null) ||
(lang != null && mainLang != null && lang.equalsIgnoreCase(mainLang))){
@@ -1043,6 +1047,43 @@ public class EntityLinker {
final LabelMatch labelMatch;
int coveredTokens = lastFoundIndex-firstFoundIndex+1;
int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
+ //check if we lookup Entities within a processable chunk
+ final float chunkMatchScore;
+ if(!textProcessingConfig.isIgnoreChunks() &&
+ state.getToken().inChunk != null && //there is a chunk
+ state.getToken().inChunk.isProcessable){ //the chunk is processable
+ ChunkData cd = state.getToken().inChunk;
+ List<TokenData> tokens = state.getTokens();
+ if(log.isTraceEnabled()){
+ log.trace(" ... checking match with chunk {}: {}",
+ cd.chunk, cd.chunk.getSpan());
+ }
+ int cstart = cd.getMatchableStart() >= 0 ? cd.getMatchableStart() :
+ firstProcessableFoundIndex;
+ int cend = cd.getMatchableEndChar();
+ //if the match does not cover the whole chunk
+ if(cstart < firstProcessableFoundIndex || cend > lastProcessableFoundIndex){
+ int foundInChunk = 0;
+ int numInChunk = 0;
+ for(int i = cd.matchableStart; i <= cd.matchableEnd ; i++){
+ TokenData td = tokens.get(i);
+ if(td.isMatchable){
+ numInChunk++;
+ if(i >= firstProcessableFoundIndex &&
+ i <= lastProcessableFoundIndex){
+ foundInChunk++;
+ }
+ }
+ }
+ chunkMatchScore = (float) foundInChunk / (float) numInChunk;
+ log.trace(" ... label matches {} of {} matchable token in Chunk",
+ foundInChunk, numInChunk);
+ } else { //matches the whole chunk
+ chunkMatchScore = 1f;
+ }
+ } else { //no chunk (or ignoreChuncks == true) .. set chunkMatchScore to 1f
+ chunkMatchScore = 1f;
+ }
//matched tokens only within the span of the first/last processable token
//Matching rules
// - if less than config#minTokenFound() than accept only EXACT
@@ -1050,10 +1091,12 @@ public class EntityLinker {
// foundTokens of the PARTIAL match is > than of the FULL/EXACT
// match (this will be very rare
String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
- if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
+ if(chunkMatchScore == 1f && //the whole chunk matches
+ (linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text))){
labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
- } else {
- int coveredLabelTokens = matchedLabelTokens.lastKey().intValue()-matchedLabelTokens.firstKey().intValue()+1;
+ } else if(chunkMatchScore >= linkerConfig.getMinChunkMatchScore()){
+ int coveredLabelTokens = matchedLabelTokens.lastKey().intValue() -
+ matchedLabelTokens.firstKey().intValue() + 1;
if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
//if all token matched set found to covered: May be lower because only
//processable tokens are counted, but FULL also checks
@@ -1064,10 +1107,30 @@ public class EntityLinker {
labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens,
foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
foundTokenMatch/(float)foundTokens,label,labelTokens.length, coveredLabelTokens);
+ } else {
+ if(log.isTraceEnabled()){ //trace level logging for STANBOL-1211
+ List<TokenData> tokens = state.getTokens();
+ int start = tokens.get(firstProcessableFoundIndex).token.getStart();
+ int end = tokens.get(lastProcessableFoundIndex).token.getEnd();
+ CharSequence content = state.getToken().token.getContext().getText();
+ CharSequence match = content.subSequence(start, end);
+ ChunkData cd = state.getToken().inChunk;
+ int cStart = tokens.get(cd.matchableStart).token.getStart();
+ int cEnd = tokens.get(cd.matchableEnd).token.getEnd();
+ CharSequence context = content.subSequence(cStart, cEnd);
+ log.trace(" - filter match '{}'@[{},{}] because it does only match "
+ + "{}% (min: {}%) of the matchable Tokens in Chunk '{}'@[{},{}]",
+ new Object[]{match, start, end, Math.round(chunkMatchScore*100),
+ Math.round(linkerConfig.getMinChunkMatchScore()*100),
+ context, cStart, cEnd});
+ }
+ labelMatch = null;
}
- if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() &&
+ if(labelMatch != null &&
+ labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() &&
labelMatch.getTextScore() >= linkerConfig.getMinTextScore() &&
labelMatch.getMatchScore() >= linkerConfig.getMinMatchScore()){
+ log.trace(" + add suggestion {}", labelMatch);
suggestion.addLabelMatch(labelMatch);
}
} //else NO tokens found -> nothing to do
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1543372&r1=1543371&r2=1543372&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Tue Nov 19 10:54:17 2013
@@ -162,6 +162,17 @@ public class SectionData {
} else if(tokenData.isMatchable){
activeChunk.matchableCount++;
}
+ if(tokenData.isMatchable){ //for matchable tokens
+ //update the matchable span within the active chunk
+ if(activeChunk.matchableStart < 0){
+ activeChunk.matchableStart = tokenData.index;
+ activeChunk.matchableStartCharIndex = tokenData.token.getStart();
+ }
+ if(activeChunk.matchableStart >= 0){ //if start is set also set end
+ activeChunk.matchableEnd = tokenData.index;
+ activeChunk.matchableEndCharIndex = tokenData.token.getEnd();
+ }
+ }
if (span.getEnd() >= activeChunk.getEndChar()){
//this is the last token in the current chunk
activeChunk.endToken = tokens.size()-1;
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1543372&r1=1543371&r2=1543372&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java Tue Nov 19 10:54:17 2013
@@ -295,6 +295,7 @@ public class EntityLinkingEngineTest {
LanguageProcessingConfig tpc = new LanguageProcessingConfig();
tpc.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
tpc.setLinkedPos(Collections.EMPTY_SET);
+ tpc.setIgnoreChunksState(true); //to emulate pre STANBOL-1211
EntityLinkerConfig config = new EntityLinkerConfig();
config.setMinFoundTokens(2);//this is assumed by this test
config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);