You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/11/30 10:31:06 UTC

svn commit: r1546708 - in /stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention: EntityCoMentionEngine.java impl/ContentItemMentionBuilder.java

Author: rwesten
Date: Sat Nov 30 09:31:06 2013
New Revision: 1546708

URL: http://svn.apache.org/r1546708
Log:
merged fix for STANBOL-1219 from trunk to the 0.12 releasing branch

Modified:
    stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
    stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java

Modified: stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java?rev=1546708&r1=1546707&r2=1546708&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/EntityCoMentionEngine.java Sat Nov 30 09:31:06 2013
@@ -289,8 +289,20 @@ public class EntityCoMentionEngine exten
                 new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
         }
         //create the in-memory database for the mentioned Entities
-        ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(ci, 
+        ContentItemMentionBuilder entityMentionIndex = new ContentItemMentionBuilder(
             labelTokenizer, language, linkerConfig.getDefaultLanguage());
+        MGraph metadata = ci.getMetadata();
+        Set<UriRef> textAnnotations = new HashSet<UriRef>();
+        ci.getLock().readLock().lock();
+        try { //iterate over all TextAnnotations (mentions of Entities)
+            for(Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext();){
+                UriRef ta = (UriRef)it.next().getSubject();
+                entityMentionIndex.registerTextAnnotation(ta, metadata);
+                textAnnotations.add(ta); //store the registered text annotations
+            }
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
         EntityLinker entityLinker = new EntityLinker(at,language, 
             languageConfig, entityMentionIndex, linkerConfig, labelTokenizer,entityMentionIndex);
         //process
@@ -303,33 +315,37 @@ public class EntityCoMentionEngine exten
         //TODO: write results
         ci.getLock().writeLock().lock();
         try {
-            writeComentions(ci,entityLinker.getLinkedEntities().values(), language);
+            writeComentions(ci,entityLinker.getLinkedEntities().values(), language, textAnnotations);
         } finally {
             ci.getLock().writeLock().unlock();
         }
     }
 
-    private void writeComentions(ContentItem ci,Collection<LinkedEntity> comentions, String language) {
+    private void writeComentions(ContentItem ci,Collection<LinkedEntity> comentions, String language,
+            Set<UriRef> textAnnotations) {
         Language languageObject = null;
         if(language != null && !language.isEmpty()){
             languageObject = new Language(language);
         }
         
         MGraph metadata = ci.getMetadata();
-        
+        //we MUST adjust the confidence level of existing annotations only once
+        //so we need to keep track of those
+        Set<NonLiteral> adjustedSuggestions = new HashSet<NonLiteral>();
         log.debug("Write Co-Mentions:");
         for(LinkedEntity comention : comentions){
             log.debug(" > {}",comention);
             //URIs of TextAnnotations for the initial mention of this co-mention
-            Collection<UriRef> initialMentions = new ArrayList<UriRef>(comention.getOccurrences().size());
+            Collection<UriRef> initialMentions = new ArrayList<UriRef>(comention.getSuggestions().size());
             for(Suggestion suggestion : comention.getSuggestions()){
                 Entity entity = suggestion.getEntity();
-                if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
+                if(textAnnotations.contains(entity.getUri())){
+//                if(entity.getData().filter(entity.getUri(),RDF_TYPE,ENHANCER_TEXTANNOTATION).hasNext()){
                     //this is a textAnnotation
                     initialMentions.add(entity.getUri());
                 } //else TODO support also Entities!!
             }
-            //first create the TextAnnotations for the co-mention
+            //create the TextAnnotations for the co-mention
             for(Occurrence occurrence : comention.getOccurrences()){
                 Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
                 Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
@@ -341,8 +357,8 @@ public class EntityCoMentionEngine exten
                 while(it.hasNext()){
                     Triple t = it.next();
                     Integer end = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
-                    if(end != null &&
-                            metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
+                    if(end != null && textAnnotations.contains(t.getSubject())){
+                            //metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
                         textAnnotation = (UriRef)t.getSubject();
                         if(end > occurrence.getEnd()){
                             // there is an other TextAnnotation selecting a bigger Span
@@ -355,8 +371,8 @@ public class EntityCoMentionEngine exten
                 while(it.hasNext()){
                     Triple t = it.next();
                     Integer start = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
-                    if(start != null &&
-                            metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
+                    if(start != null && textAnnotations.contains(t.getSubject())){
+                            //metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()){
                         textAnnotation = (UriRef)t.getSubject();
                         if(start < occurrence.getStart()){
                             // there is an other TextAnnotation selecting a bigger Span
@@ -367,10 +383,11 @@ public class EntityCoMentionEngine exten
                 }
                 if(!ignore){
                     //collect confidence values of co-mentions
-                    Double maxConfidence = null;
-                    Double maxExistingConfidence = null;
+                    Double maxConfidence = null; //maximum confidence of suggestions of the initial mention
+                    Double maxExistingConfidence = null; //maximum confidence of existing suggestions
                     if(textAnnotation == null){ //not found ... create a new TextAnnotation for the co-mention
                         textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                        textAnnotations.add(textAnnotation); //add it to the set of TextAnnotations
                         metadata.add(new TripleImpl(textAnnotation, 
                             Properties.ENHANCER_START, 
                             startLiteral));
@@ -386,9 +403,8 @@ public class EntityCoMentionEngine exten
                     } else { //if existing add this engine as contributor
                         metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, 
                             new PlainLiteralImpl(this.getClass().getName())));
-                        //consider the confidence value of the existing TextAnnotation
-                        maxConfidence = EnhancementEngineHelper.get(metadata, textAnnotation, 
-                            ENHANCER_CONFIDENCE, Double.class, literalFactory);
+                        //maxConfidence = EnhancementEngineHelper.get(metadata, textAnnotation, 
+                        //    ENHANCER_CONFIDENCE, Double.class, literalFactory);
                     }
                     //now process initial mention(s) for the co-mention
                     Set<UriRef> dcTypes = new HashSet<UriRef>();
@@ -398,7 +414,7 @@ public class EntityCoMentionEngine exten
                         while(dcTypesIt.hasNext()){
                             dcTypes.add(dcTypesIt.next());
                         }
-                        //check confidence of the initial one
+                        //check confidence of the initial mention (fise:TextAnnotation)
                         Double confidnece = EnhancementEngineHelper.get(metadata, initialMention, 
                             ENHANCER_CONFIDENCE, Double.class, literalFactory);
                         if(confidnece != null){
@@ -408,15 +424,82 @@ public class EntityCoMentionEngine exten
                                 maxConfidence = confidnece;
                             }
                         }
+                        //now we need to compare the suggestions of the initial
+                        //mention(s) with the existing one. 
+                        //Get information about the suggestions of the initial mention
+                        Map<Resource,Double> initialSuggestions = new HashMap<Resource,Double>();
+                        Map<Resource, Resource> initialSuggestedEntities = new HashMap<Resource,Resource>();
+                        for(Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
+                            if(!textAnnotations.contains(suggestions)) {
+                                NonLiteral suggestion = suggestions.next().getSubject();
+                                Resource suggestedEntity = EnhancementEngineHelper.getReference(metadata, suggestion, ENHANCER_ENTITY_REFERENCE);
+                                if(suggestedEntity != null){ //it has a suggestion
+                                    Double confidence = EnhancementEngineHelper.get(
+                                        metadata, suggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
+                                    if(maxConfidence == null){
+                                        maxConfidence = confidence;
+                                    } else if(maxConfidence.compareTo(confidnece) <= 0){
+                                        maxConfidence = confidnece;
+                                    }
+                                    initialSuggestions.put(suggestion,confidence);
+                                    initialSuggestedEntities.put(suggestedEntity, suggestion);
+                                } //no suggestion (dc:relation to some other resource)
+                            } // else ignore dc:relation to other fise:TextAnnotations
+                        }
+                        //now we collect existing Suggestions for this TextAnnotation where we need
+                        //to adjust the confidence (quite some things to check ....)
                         Map<NonLiteral, Double> existingSuggestions = new HashMap<NonLiteral,Double>();
                     	if(maxConfidence != null && confidenceAdjustmentFactor < 1){
-                    		//adapt confidence of existing annotations
+                    	    //suggestions are defined by incoming dc:relation
 	                        for(Iterator<Triple> esIt = metadata.filter(null, DC_RELATION, textAnnotation);esIt.hasNext();){
 	                        	NonLiteral existingSuggestion = esIt.next().getSubject();
-	                        	existingSuggestions.put(existingSuggestion,
-	                        			EnhancementEngineHelper.get(metadata, existingSuggestion, 
-	                        					ENHANCER_CONFIDENCE, Double.class, literalFactory));
-	                        }
+	                        	//but not all of them are suggestions
+	                        	if(!textAnnotations.contains(existingSuggestion)) { //ignore fise:TextAnnotations
+	                                Double existingConfidence = EnhancementEngineHelper.get(metadata, existingSuggestion, 
+                                        ENHANCER_CONFIDENCE, Double.class, literalFactory);
+	                                //ignore fise:TextAnnotations also suggested for the initial mention
+                                    if(!initialSuggestions.containsKey(existingSuggestion)){
+                                        Resource suggestedEntity = EnhancementEngineHelper.getReference(metadata, existingSuggestion, ENHANCER_ENTITY_REFERENCE);
+                                        //we might also have different fise:TextAnnotations that
+                                        //fise:entity-reference to an Entity present in the
+                                        //suggestions for the initial mention
+                                        if(!initialSuggestedEntities.containsKey(suggestedEntity)){
+                                            //finally make sure that we adjust confidences only once
+                                            if(!adjustedSuggestions.contains(existingSuggestion)){
+                                                existingSuggestions.put(existingSuggestion, existingConfidence);
+                                            } //else confidence already adjusted
+                                        } else { // different fise:EntityAnnotation, but same reference Entity
+                                            //we need to check confidences to decide what to do
+                                            Resource initialSuggestion = initialSuggestedEntities.get(suggestedEntity);
+                                            Double initialConfidence = initialSuggestions.get(initialSuggestion);
+                                            if((existingConfidence == null && initialConfidence == null) ||
+                                                    (existingConfidence != null && 
+                                                    existingConfidence.compareTo(initialConfidence) >= 0)){
+                                                //existing confidence >= initial .. keep existing
+                                                initialSuggestions.remove(initialSuggestion); 
+                                                if(maxExistingConfidence == null){
+                                                    maxExistingConfidence = existingConfidence;
+                                                } else if(maxExistingConfidence.compareTo(existingConfidence) <= 0){
+                                                    maxExistingConfidence = existingConfidence;
+                                                }
+                                            } else { //initial has higher confidence
+                                                //adjust this one (if not yet adjusted)
+                                                if(!adjustedSuggestions.contains(existingSuggestion)){
+                                                    existingSuggestions.put(existingSuggestion, existingConfidence);
+                                                } 
+                                            }
+                                        }
+                                    } else { //a initial mention already present
+                                        //no need to process initial mention
+                                        initialSuggestions.remove(existingSuggestion);
+                                        if(maxExistingConfidence == null){
+                                            maxExistingConfidence = existingConfidence;
+                                        } else if(maxExistingConfidence.compareTo(existingConfidence) <= 0){
+                                            maxExistingConfidence = existingConfidence;
+                                        }
+                                    }
+	                        	} //else ignore dc:relations to other fise:TextAnnotations
+ 	                        }
 	                        for(Entry<NonLiteral,Double> entry : existingSuggestions.entrySet()){
 	                        	if(entry.getValue() != null){
 	                        		double adjustedConfidence = entry.getValue() * confidenceAdjustmentFactor;
@@ -425,15 +508,12 @@ public class EntityCoMentionEngine exten
 	                        		}
 	                        		EnhancementEngineHelper.set(metadata, entry.getKey(), 
 	                        				ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
+	                        		adjustedSuggestions.add(entry.getKey()); //mark as adjusted
 	                        	}
 	                        }
                     	}
-                        //add the suggestions of the initial mention to this one
-                        Set<Resource> values = new HashSet<Resource>();
-                        for(Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext();){
-                            values.add(suggestions.next().getSubject());
-                        }
-                        for(Resource suggestion : values){
+                    	//add the suggestions of the initial mention to this one
+                        for(Resource suggestion : initialSuggestions.keySet()){
                             metadata.add(new TripleImpl((NonLiteral)suggestion, DC_RELATION, textAnnotation));
     
                         }

Modified: stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java?rev=1546708&r1=1546707&r2=1546708&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java Sat Nov 30 09:31:06 2013
@@ -28,12 +28,15 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
 import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
@@ -50,51 +53,36 @@ public class ContentItemMentionBuilder e
     private static final Logger log = LoggerFactory.getLogger(ContentItemMentionBuilder.class);
     private static final LiteralFactory lf = LiteralFactory.getInstance();
     
-    private ContentItem ci;
     /**
      * The last index notified via {@link #startToken(Token)}
      */
     private Integer lastIndex = 0; 
     
     private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new TreeMap<Integer,Collection<EntityMention>>();
-
     
-    public ContentItemMentionBuilder(ContentItem ci, LabelTokenizer labelTokenizer,
-            String...languages){
+    public ContentItemMentionBuilder(LabelTokenizer labelTokenizer, String...languages){
         super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD, languages);
-        this.ci = ci;
-        ci.getLock().readLock().lock();
-        try {
-            initContext();
-        } finally {
-            ci.getLock().readLock().unlock();
-        }
     }
 
-
-    private void initContext() {
-        MGraph m = ci.getMetadata();
-        for(Iterator<Triple> it = m.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION); it.hasNext();){
-            UriRef ta = (UriRef)it.next().getSubject();
-            String selectedText = EnhancementEngineHelper.getString(m, ta, ENHANCER_SELECTED_TEXT);
-            if(selectedText != null){
-                //NOTE: Typically it is not possible to find co-mentions for Entities with a
-                //      single Token, so can ignore those.
-                //      The only exception would be to use proper-nouns for initial linking and
-                //      Nouns for the co-mention resolution. In such cases this might result
-                //      in additional extractions.
-                String[] tokens = tokenizer.tokenize(selectedText, language);
-                if(tokens.length > 1){ //TODO make configurable
-                    Double confidence = EnhancementEngineHelper.get(m,ta,ENHANCER_CONFIDENCE,Double.class,lf);
-                    if(confidence == null || confidence > 0.85){ //TODO make configurable
-                        Integer start = EnhancementEngineHelper.get(m,ta,ENHANCER_START,Integer.class,lf);
-                        Integer end = EnhancementEngineHelper.get(m,ta,ENHANCER_END,Integer.class,lf);
-                        registerMention(new EntityMention(ta,m, ENHANCER_SELECTED_TEXT, DC_TYPE, 
-                            start != null && end != null ? new Integer[]{start,end} : null));
-                    } // else confidence to low
-                } //else ignore Tokens with a single token
-            } // else no selected text
-        }
+    public void registerTextAnnotation(UriRef textAnnotation, TripleCollection metadata){
+        String selectedText = EnhancementEngineHelper.getString(metadata, textAnnotation, ENHANCER_SELECTED_TEXT);
+        if(selectedText != null){
+            //NOTE: Typically it is not possible to find co-mentions for Entities with a
+            //      single Token, so can ignore those.
+            //      The only exception would be to use proper-nouns for initial linking and
+            //      Nouns for the co-mention resolution. In such cases this might result
+            //      in additional extractions.
+            String[] tokens = tokenizer.tokenize(selectedText, language);
+            if(tokens.length > 1){ //TODO make configurable
+                Double confidence = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_CONFIDENCE,Double.class,lf);
+                if(confidence == null || confidence > 0.85){ //TODO make configurable
+                    Integer start = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_START,Integer.class,lf);
+                    Integer end = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_END,Integer.class,lf);
+                    registerMention(new EntityMention(textAnnotation,metadata, ENHANCER_SELECTED_TEXT, DC_TYPE, 
+                        start != null && end != null ? new Integer[]{start,end} : null));
+                } // else confidence to low
+            } //else ignore Tokens with a single token
+        } // else no selected text
     }
 
     private void registerMention(EntityMention entityMention){