You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/22 09:45:15 UTC
svn commit: r1560281 - in /stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking: config/ impl/

Author: rwesten
Date: Wed Jan 22 08:45:15 2014
New Revision: 1560281

URL: http://svn.apache.org/r1560281
Log:
implementation of STANBOL-1262 and STANBOL-1266 for the 0.12 branch: EntityLinking now considers Chunks with NER_ANNOTATIONS if Noun Phrases are marked as processable chunks; Multiple overlapping Chunks are no longer merged, but the latest Chunk is used instead

Modified:
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Wed Jan 22 08:45:15 2014
@@ -36,6 +36,7 @@ public class LanguageProcessingConfig im
      */
     public static final Set<LexicalCategory> DEFAULT_PROCESSED_PHRASE_CATEGORIES = 
             EnumSet.of(LexicalCategory.Noun);
+        
     /**
      * The default set of {@link LexicalCategory LexicalCategories} used to
      * lookup (link) Entities within the {@link EntitySearcher}

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Wed Jan 22 08:45:15 2014
@@ -19,12 +19,14 @@ package org.apache.stanbol.enhancer.engi
 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
 
 import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Sentence;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
 import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 
 /** 
  * Represents a Chunk (group of tokens) used as context for EntityLinking.
@@ -41,19 +43,12 @@ import org.apache.stanbol.enhancer.nlp.p
  */
 public class ChunkData {
     protected final static boolean DEFAULT_PROCESSABLE_STATE = true;
+    /** if this Chunk represents a Named Entity **/
+    protected final boolean isNamedEntity;
     /** if the Chunk is processable */
     public final boolean isProcessable;
     /** the Chunk */
     public final Chunk chunk;
-    /** 
-     * In case multiple overlapping and processable {@link Chunk}s the
-     * section selected by the chunks are merged. While {@link #chunk}
-     * holds the original chunk (the first) this variable holds the
-     * last merged one. Enclosed chunks (in case more than two are
-     * merged) are not available via this class, but can be retrieved
-     * by iterating over the {@link AnalysedText} content part.
-     */
-    Chunk merged;
     /** the start token index relative to the current section (sentence) */
     int startToken;
     /** the end token index relative to the current section (sentence) */
@@ -100,6 +95,13 @@ public class ChunkData {
                 break;
             } // else probability to low for exclusion
         }
+        //fallback for NER chunks in case Noun Phrases are processible and a NER
+        //annotation is present for the parsed chunk.
+        isNamedEntity = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION) != null;
+        if(process == null && isNamedEntity &&
+        		tpc.getProcessedPhraseCategories().contains(LexicalCategory.Noun)){
+        	process = true;
+        }
         isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process;
     }
     /**
@@ -110,13 +112,11 @@ public class ChunkData {
         return chunk.getStart();
     }
     /**
-     * Getter for the end character position of the text selected by
-     * possible multiple {@link #merged} chunks.
-     * @return the end character position considering possible {@link #merged}
-     * chunks.
+     * Getter for the end character position of the text
+     * @return the end character position
      */
     public int getEndChar(){
-        return merged == null ? chunk.getEnd() : merged.getEnd();
+        return chunk.getEnd();
     }
     /**
      * If this chunk is processable
@@ -125,6 +125,11 @@ public class ChunkData {
     public boolean isProcessable() {
         return isProcessable;
     }
+    
+    public boolean isNamedEntity() {
+    	return isNamedEntity;
+    }
+    
     /**
      * Getter for the number of matchable tokens contained in this chunk
      * @return The number of matchable tokens contained in this chunk

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Wed Jan 22 08:45:15 2014
@@ -20,7 +20,6 @@
 package org.apache.stanbol.enhancer.engines.entitylinking.impl;
 
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
-import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -32,14 +31,10 @@ import java.util.Locale;
 
 import org.apache.commons.collections.Predicate;
 import org.apache.commons.collections.iterators.FilterIterator;
-import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
-import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
-import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Section;
 import org.apache.stanbol.enhancer.nlp.model.Sentence;
-import org.apache.stanbol.enhancer.nlp.model.Span;
 import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
@@ -92,7 +87,7 @@ public class ProcessingState {
     protected final LanguageProcessingConfig tpc;
     //protected final EntityLinkerConfig elc;
 
-    private AnalysedText at;
+    //private AnalysedText at;
     /**
      * If the language uses a unicase script and therefore upper case specific
      * processing rules can not be used (see STANBOL-1049)
@@ -125,7 +120,7 @@ public class ProcessingState {
         if(!tpc.isIgnoreChunks()){
             enclosedSpanTypes.add(SpanTypeEnum.Chunk);
         }
-        this.at = at; //store as field (just used for logging)
+        //this.at = at; //store as field (just used for logging)
         this.language = language;
         //STANBOL-1049: we need now to know if a language uses a unicase script
         //ensure lower case and only use the language part 
@@ -244,7 +239,7 @@ public class ProcessingState {
                 continue; //ignore this section
             }
             consumedSectionIndex = section.getEnd();
-            SectionData sectionData = new SectionData(tpc, section, enclosedSpanTypes, foundLinkableToken);
+            SectionData sectionData = new SectionData(tpc, section, enclosedSpanTypes, isUnicaseLanguage);
             //TODO: It would be better to use a SectionData field instead
             tokens = sectionData.getTokens();
             section = sectionData.section;
@@ -285,9 +280,6 @@ public class ProcessingState {
             sb.append("none");
         } else {
             sb.append(token.inChunk.chunk);
-            if(token.inChunk.merged != null){
-                sb.append("(merged with ").append(token.inChunk.merged).append(')');
-            }
         }
         sb.append("| sentence: ");
         if(section == null){

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1560281&r1=1560280&r2=1560281&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Wed Jan 22 08:45:15 2014
@@ -19,6 +19,7 @@ package org.apache.stanbol.enhancer.engi
 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -54,7 +55,7 @@ public class SectionData {
             Set<SpanTypeEnum> enclosedSpanTypes, boolean isUnicaseLanguage){
         this.section = section;
         Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
-        ChunkData activeChunk = null;
+        List<ChunkData> activeChunks = new ArrayList<ChunkData>();
         while(enclosed.hasNext()){
             Span span = enclosed.next();
             if(span.getStart() >= span.getEnd()){ //save guard against empty spans
@@ -64,27 +65,20 @@ public class SectionData {
             if(span.getType() == SpanTypeEnum.Chunk){
                 ChunkData chunkData = new ChunkData(tpc,(Chunk)span);
                 if(chunkData.isProcessable()){
-                    if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
-                        if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
-                            log.info("   - merge overlapping and processable Chunks {} <-> {}",
-                                activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
-                            activeChunk.merged = (Chunk)span; //set this one as last merged
-                        } //ignore completely covered chunks
-                    } else { // a new Chunk starts
-                        activeChunk = chunkData;
-                        activeChunk.startToken = tokens.size();
-                        if(log.isDebugEnabled()){
-                            log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
-                                new Object []{
-                                    activeChunk.chunk.getType(),
-                                    activeChunk.startToken,
-                                    activeChunk.chunk.getSpan()
-                                });
-                        }
+                	activeChunks.add(0, chunkData);
+                	chunkData.startToken = tokens.size();
+                    if(log.isDebugEnabled()){
+                        log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
+                            new Object []{
+                        		chunkData.chunk.getType(),
+                        		chunkData.startToken,
+                        		chunkData.chunk.getSpan()
+                            });
                     } 
                 } //else ignore chunks that are not processable
             } else if(span.getType() == SpanTypeEnum.Token){
-                TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,activeChunk);
+                TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,
+                		activeChunks.isEmpty() ? null : activeChunks.get(0));
                 if(log.isDebugEnabled()){
                     log.debug("  > {}: {} {}(pos:{}) chunk: '{}'",
                         new Object[]{tokenData.index,tokenData.token,
@@ -155,7 +149,9 @@ public class SectionData {
                 if(!hasLinkableToken){
                     hasLinkableToken = tokenData.isLinkable;
                 }
-                if(activeChunk != null){
+                Iterator<ChunkData> activeChunkIt = activeChunks.iterator();
+                while(activeChunkIt.hasNext()){
+                	ChunkData activeChunk = activeChunkIt.next();
                     if (tokenData.isLinkable){
                         //ignore matchableCount in Chunks with linkable Tokens
                         activeChunk.matchableCount = -10; //by setting the count to -10
@@ -173,10 +169,14 @@ public class SectionData {
                             activeChunk.matchableEndCharIndex = tokenData.token.getEnd();
                         }
                     }
-                    if (span.getEnd() >= activeChunk.getEndChar()){
+                    if(span.getEnd() >= activeChunk.getEndChar()){
                         //this is the last token in the current chunk
                         activeChunk.endToken = tokens.size()-1;
-                        log.debug("   - end Chunk@pos: {}", activeChunk.endToken);
+                        if(log.isDebugEnabled()){
+	                        log.debug(" << end Chunk {} '{}' @pos: {}", new Object[]{
+	                        		activeChunk.chunk, activeChunk.chunk.getSpan(),
+	                        		activeChunk.endToken});
+                        }
                         if(tpc.isLinkMultiMatchableTokensInChunk() && 
                                 activeChunk.getMatchableCount() > 1 ){
                             log.debug("   - multi-matchable Chunk:");
@@ -198,7 +198,8 @@ public class SectionData {
                                 }
                             }
                         }
-                        activeChunk = null;
+                        //remove the closed chunk from the list with active
+                        activeChunkIt.remove(); 
                     }
                 }
             }