You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/07/10 14:22:08 UTC
svn commit: r1609451 - in
/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking:
FstLinkingEngine.java LinkableTokenFilter.java
Author: rwesten
Date: Thu Jul 10 12:22:08 2014
New Revision: 1609451
URL: http://svn.apache.org/r1609451
Log:
implementation for STANBOL-1362
Modified:
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1609451&r1=1609450&r2=1609451&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Thu Jul 10 12:22:08 2014
@@ -388,7 +388,7 @@ public class FstLinkingEngine implements
// linkable Token
// (2) the LONGEST_DOMINANT_RIGHT reducer (TODO: make configurable)
TagClusterReducer reducer = new ChainedTagClusterReducer(
- TagClusterReducer.LONGEST_DOMINANT_RIGHT, linkableTokenFilter);
+ linkableTokenFilter,TagClusterReducer.ALL);
final long[] time = new long[]{0};
new Tagger(corpus.getFst(), linkableTokenFilter, reducer,session.isSkipAltTokens()) {
Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1609451&r1=1609450&r2=1609451&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java Thu Jul 10 12:22:08 2014
@@ -21,11 +21,15 @@ import static org.apache.stanbol.enhance
import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedAction;
+import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
@@ -379,6 +383,18 @@ public final class LinkableTokenFilter e
}
@Override
public void reduce(TagLL[] head) {
+ //this implements a two phase reduce
+ //(1) reduce Tags with no linkable tokens and not matching enough of the
+ // current chunk.
+ //(2) reduce remaining Tags in the cluster similar to TagClusterReducer
+ // but only considering the "matchable span" of the Tags. Meaning the
+ // span over matchable Tokens and not the full Text.
+
+ //this map holds the matchable spans for Tags. Filled during phase (1) and
+ //used for phase(2)
+ Map<TagLL,int[]> matchableTagSpan = new HashMap<TagLL,int[]>();
+
+ //(1) reduce Tags based on link-/matchable tokens as well as chunks.
LinkableTokenContext linkableTokenContext;
for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
@@ -398,9 +414,42 @@ public final class LinkableTokenFilter e
} else { //if the tag overlaps a linkable token
TokenData linkableToken = linkableTokenContext.linkableToken;
List<TokenData> tokens = linkableTokenContext.context;
- ChunkData cd = linkableToken.inChunk; //check if it maches > 50% of the chunk
- if(!lpc.isIgnoreChunks() && cd != null &&
- cd.isProcessable){
+ //calculate the matchable start/end span of the current TagLL
+ int[] mSpan = new int[]{
+ Math.max(start,linkableToken.token.getStart()),
+ Math.min(end,linkableToken.token.getEnd())};
+ if(mSpan[0] > start){
+ for(int i = linkableToken.index-1; i >= 0; i--){
+ TokenData token = tokens.get(i);
+ int tStart = token.token.getStart();
+ if(tStart < start){
+ break;
+ } else if(token.isMatchable){
+ mSpan[0] = tStart;
+ }
+ }
+ }
+ if(mSpan[1] < end){
+ for(int i= linkableToken.index+1; i < tokens.size();i++){
+ TokenData token = tokens.get(i);
+ int tEnd = token.token.getEnd();
+ if(tEnd > end){
+ break;
+ } else if(token.isMatchable){
+ mSpan[1] = tEnd;
+ }
+ }
+ }
+ if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" - matchable Span {}{} for Tag {}[{},{}]",
+ new Object[]{ text.subSequence(mSpan[0],mSpan[1]),
+ Arrays.toString(mSpan), text.subSequence(start, end),
+ start, end});
+ }
+ matchableTagSpan.put(tag, mSpan);
+ ChunkData cd = linkableToken.inChunk; //check if it matches > 50% of the chunk
+ if(!lpc.isIgnoreChunks() && cd != null && cd.isProcessable){
int cstart = cd.getMatchableStartChar() >= 0 ? cd.getMatchableStartChar() :
start;
int cend = cd.getMatchableEndChar();
@@ -422,6 +471,7 @@ public final class LinkableTokenFilter e
if(((float)match/(float)num) < minChunkMatchScore &&
match < minFoundTokens){
tag.removeLL(); //ignore
+ matchableTagSpan.remove(tag);
if(log.isTraceEnabled()){
CharSequence text = at.getText();
log.trace(" - reduce tag {}[{},{}] - does only match "
@@ -436,11 +486,13 @@ public final class LinkableTokenFilter e
new Object[]{text.subSequence(start, end), start, end, match,
num, text.subSequence(cstart, cend), cstart, cend});
}
- } else if(log.isTraceEnabled()){
- CharSequence text = at.getText();
- log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]",
- new Object[]{text.subSequence(start, end), start, end,
- text.subSequence(cstart, cend), cstart, cend});
+ } else {
+ if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]",
+ new Object[]{text.subSequence(start, end), start, end,
+ text.subSequence(cstart, cend), cstart, cend});
+ }
}
} else if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start, end);
@@ -448,7 +500,47 @@ public final class LinkableTokenFilter e
}
}
}
-
+ //(2) reduce Tags based on longest dominant right, using the matchable
+ // spans
+ //NOTE: This is the same code as TagClusterReducer#LONGEST_DOMINANT_RIGHT
+ // but adapted to use the matchable spans instead of the full Tag
+ // spans
+ if (head.length == 0 || head[0] == null || head[0].getNextTag() == null) {
+ return; //no tag left from phase one, or only a single tag (nothing to reduce)
+ }
+ Set<TagLL> marked = new HashSet<TagLL>(); //can not use TagLL#mark
+ while (true) {
+ // --Find longest not already marked
+ TagLL longest = null;
+ int longestMCharLen = -1;
+ int[] longestMSpan = null;
+ for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
+ int[] mSpan = matchableTagSpan.get(t);
+ int mCharLen = mSpan[1] - mSpan[0];
+ if (!marked.contains(t) && (longest == null || mCharLen >= longestMCharLen)) {
+ longest = t;
+ longestMSpan = mSpan;
+ longestMCharLen = mCharLen;
+ }
+ }
+ if (longest == null) break;
+ // --Mark longest (so we return it eventually)
+ marked.add(longest);
+ // --Remove tags overlapping this longest
+ for (TagLL t = head[0]; t != null; t = t.getNextTag()) {
+ if (marked.contains(t)) {
+ continue;
+ }
+ int[] mSpan = matchableTagSpan.get(t);
+ boolean overlaps =
+ mSpan[0] < longestMSpan[0] ? mSpan[1] > longestMSpan[1] : mSpan[0] < longestMSpan[1];
+ if (overlaps) {
+ t.removeLL();
+ } else if (mSpan[0] >= longestMSpan[1]) {
+ break;// no subsequent can possibly overlap
+ }
+ }
+ }// loop
}
/**