You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/22 09:45:15 UTC
svn commit: r1560281 - in
/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking:
config/ impl/
Author: rwesten
Date: Wed Jan 22 08:45:15 2014
New Revision: 1560281
URL: http://svn.apache.org/r1560281
Log:
Implementation of STANBOL-1262 and STANBOL-1266 for the 0.12 branch: EntityLinking now considers Chunks with NER_ANNOTATIONS if Noun Phrases are marked as processable chunks; multiple overlapping Chunks are no longer merged, but the latest Chunk is used instead
Modified:
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Wed Jan 22 08:45:15 2014
@@ -36,6 +36,7 @@ public class LanguageProcessingConfig im
*/
public static final Set<LexicalCategory> DEFAULT_PROCESSED_PHRASE_CATEGORIES =
EnumSet.of(LexicalCategory.Noun);
+
/**
* The default set of {@link LexicalCategory LexicalCategories} used to
* lookup (link) Entities within the {@link EntitySearcher}
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Wed Jan 22 08:45:15 2014
@@ -19,12 +19,14 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
/**
* Represents a Chunk (group of tokens) used as context for EntityLinking.
@@ -41,19 +43,12 @@ import org.apache.stanbol.enhancer.nlp.p
*/
public class ChunkData {
protected final static boolean DEFAULT_PROCESSABLE_STATE = true;
+ /** if this Chunk represents a Named Entity **/
+ protected final boolean isNamedEntity;
/** if the Chunk is processable */
public final boolean isProcessable;
/** the Chunk */
public final Chunk chunk;
- /**
- * In case multiple overlapping and processable {@link Chunk}s the
- * section selected by the chunks are merged. While {@link #chunk}
- * holds the original chunk (the first) this variable holds the
- * last merged one. Enclosed chunks (in case more than two are
- * merged) are not available via this class, but can be retrieved
- * by iterating over the {@link AnalysedText} content part.
- */
- Chunk merged;
/** the start token index relative to the current section (sentence) */
int startToken;
/** the end token index relative to the current section (sentence) */
@@ -100,6 +95,13 @@ public class ChunkData {
break;
} // else probability to low for exclusion
}
+ //fallback for NER chunks in case Noun Phrases are processible and a NER
+ //annotation is present for the parsed chunk.
+ isNamedEntity = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION) != null;
+ if(process == null && isNamedEntity &&
+ tpc.getProcessedPhraseCategories().contains(LexicalCategory.Noun)){
+ process = true;
+ }
isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process;
}
/**
@@ -110,13 +112,11 @@ public class ChunkData {
return chunk.getStart();
}
/**
- * Getter for the end character position of the text selected by
- * possible multiple {@link #merged} chunks.
- * @return the end character position considering possible {@link #merged}
- * chunks.
+ * Getter for the end character position of the text
+ * @return the end character position
*/
public int getEndChar(){
- return merged == null ? chunk.getEnd() : merged.getEnd();
+ return chunk.getEnd();
}
/**
* If this chunk is processable
@@ -125,6 +125,11 @@ public class ChunkData {
public boolean isProcessable() {
return isProcessable;
}
+
+ public boolean isNamedEntity() {
+ return isNamedEntity;
+ }
+
/**
* Getter for the number of matchable tokens contained in this chunk
* @return The number of matchable tokens contained in this chunk
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Wed Jan 22 08:45:15 2014
@@ -20,7 +20,6 @@
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
-import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
import java.util.Collection;
@@ -32,14 +31,10 @@ import java.util.Locale;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
-import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
-import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
-import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
-import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
@@ -92,7 +87,7 @@ public class ProcessingState {
protected final LanguageProcessingConfig tpc;
//protected final EntityLinkerConfig elc;
- private AnalysedText at;
+ //private AnalysedText at;
/**
* If the language uses a unicase script and therefore upper case specific
* processing rules can not be used (see STANBOL-1049)
@@ -125,7 +120,7 @@ public class ProcessingState {
if(!tpc.isIgnoreChunks()){
enclosedSpanTypes.add(SpanTypeEnum.Chunk);
}
- this.at = at; //store as field (just used for logging)
+ //this.at = at; //store as field (just used for logging)
this.language = language;
//STANBOL-1049: we need now to know if a language uses a unicase script
//ensure lower case and only use the language part
@@ -244,7 +239,7 @@ public class ProcessingState {
continue; //ignore this section
}
consumedSectionIndex = section.getEnd();
- SectionData sectionData = new SectionData(tpc, section, enclosedSpanTypes, foundLinkableToken);
+ SectionData sectionData = new SectionData(tpc, section, enclosedSpanTypes, isUnicaseLanguage);
//TODO: It would be better to use a SectionData field instead
tokens = sectionData.getTokens();
section = sectionData.section;
@@ -285,9 +280,6 @@ public class ProcessingState {
sb.append("none");
} else {
sb.append(token.inChunk.chunk);
- if(token.inChunk.merged != null){
- sb.append("(merged with ").append(token.inChunk.merged).append(')');
- }
}
sb.append("| sentence: ");
if(section == null){
Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Wed Jan 22 08:45:15 2014
@@ -19,6 +19,7 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
@@ -54,7 +55,7 @@ public class SectionData {
Set<SpanTypeEnum> enclosedSpanTypes, boolean isUnicaseLanguage){
this.section = section;
Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
- ChunkData activeChunk = null;
+ List<ChunkData> activeChunks = new ArrayList<ChunkData>();
while(enclosed.hasNext()){
Span span = enclosed.next();
if(span.getStart() >= span.getEnd()){ //save guard against empty spans
@@ -64,27 +65,20 @@ public class SectionData {
if(span.getType() == SpanTypeEnum.Chunk){
ChunkData chunkData = new ChunkData(tpc,(Chunk)span);
if(chunkData.isProcessable()){
- if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
- if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
- log.info(" - merge overlapping and processable Chunks {} <-> {}",
- activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
- activeChunk.merged = (Chunk)span; //set this one as last merged
- } //ignore completely covered chunks
- } else { // a new Chunk starts
- activeChunk = chunkData;
- activeChunk.startToken = tokens.size();
- if(log.isDebugEnabled()){
- log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
- new Object []{
- activeChunk.chunk.getType(),
- activeChunk.startToken,
- activeChunk.chunk.getSpan()
- });
- }
+ activeChunks.add(0, chunkData);
+ chunkData.startToken = tokens.size();
+ if(log.isDebugEnabled()){
+ log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
+ new Object []{
+ chunkData.chunk.getType(),
+ chunkData.startToken,
+ chunkData.chunk.getSpan()
+ });
}
} //else ignore chunks that are not processable
} else if(span.getType() == SpanTypeEnum.Token){
- TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,activeChunk);
+ TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,
+ activeChunks.isEmpty() ? null : activeChunks.get(0));
if(log.isDebugEnabled()){
log.debug(" > {}: {} {}(pos:{}) chunk: '{}'",
new Object[]{tokenData.index,tokenData.token,
@@ -155,7 +149,9 @@ public class SectionData {
if(!hasLinkableToken){
hasLinkableToken = tokenData.isLinkable;
}
- if(activeChunk != null){
+ Iterator<ChunkData> activeChunkIt = activeChunks.iterator();
+ while(activeChunkIt.hasNext()){
+ ChunkData activeChunk = activeChunkIt.next();
if (tokenData.isLinkable){
//ignore matchableCount in Chunks with linkable Tokens
activeChunk.matchableCount = -10; //by setting the count to -10
@@ -173,10 +169,14 @@ public class SectionData {
activeChunk.matchableEndCharIndex = tokenData.token.getEnd();
}
}
- if (span.getEnd() >= activeChunk.getEndChar()){
+ if(span.getEnd() >= activeChunk.getEndChar()){
//this is the last token in the current chunk
activeChunk.endToken = tokens.size()-1;
- log.debug(" - end Chunk@pos: {}", activeChunk.endToken);
+ if(log.isDebugEnabled()){
+ log.debug(" << end Chunk {} '{}' @pos: {}", new Object[]{
+ activeChunk.chunk, activeChunk.chunk.getSpan(),
+ activeChunk.endToken});
+ }
if(tpc.isLinkMultiMatchableTokensInChunk() &&
activeChunk.getMatchableCount() > 1 ){
log.debug(" - multi-matchable Chunk:");
@@ -198,7 +198,8 @@ public class SectionData {
}
}
}
- activeChunk = null;
+ //remove the closed chunk from the list with active
+ activeChunkIt.remove();
}
}
}