You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/21 09:16:20 UTC
svn commit: r1559933 - in /stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker: PhraseBuilder.java PhraseTypeDefinition.java engine/PosChunkerEngine.java

Author: rwesten
Date: Tue Jan 21 08:16:20 2014
New Revision: 1559933

URL: http://svn.apache.org/r1559933
Log:
STANBOL-1251: added prefix type category to PhraseDefinition; added detailed trace level logging to the PhraseBuilder - commit for 0.12 branch

Modified:
    stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
    stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
    stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java

Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java?rev=1559933&r1=1559932&r2=1559933&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java Tue Jan 21 08:16:20 2014
@@ -32,9 +32,13 @@ import org.apache.stanbol.enhancer.nlp.m
 import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class PhraseBuilder {
     
+	private final Logger log = LoggerFactory.getLogger(getClass());
+	
     /**
      * Just a fallback in case Pos annotations do not provide probabilities. 
      * In most cases the value of this will not have any effect as typically 
@@ -75,6 +79,7 @@ public class PhraseBuilder {
             throw new IllegalArgumentException("The parsed PhraseTypeDefinition MUST NOT be NULL!");
         }
         this.phraseType = phraseType;
+        log.debug("Create {} for {}",getClass().getSimpleName(),phraseType);
         this.phraseTag = new PhraseTag(phraseType.getPhraseType().name(), 
             phraseType.getPhraseType());
         if(chunkFactory == null){
@@ -100,6 +105,7 @@ public class PhraseBuilder {
     
     public void nextSection(Section section){
         buildPhrase(null);
+    	log.trace("-- next {} --", section);
     }
     
 
@@ -109,6 +115,11 @@ public class PhraseBuilder {
             phraseType.getRequiredType());
         if(states[0]){
             current.add(token);
+            if(log.isTraceEnabled()) {
+	        	log.trace("-- {} phrase start --", phraseType.getPhraseType().name());
+	        	log.trace(" {}. {} {}", new Object[]{ current.size(), token, 
+	        			logPosCategories(token)});
+            }
             valid = states[1];
         }
     }
@@ -116,14 +127,18 @@ public class PhraseBuilder {
     @SuppressWarnings("unchecked") //varargs with generic types
     private boolean checkContinuation(Token token){
         final boolean[] states;
-        if(!valid){
-            states = checkCategories(token, phraseType.getContinuationType(),
+        if(!valid){ //check for prefix types and required types
+            states = checkCategories(token, phraseType.getPrefixType(),
                 phraseType.getRequiredType());
-        } else {
+        } else { //check for continuation types
             states = checkCategories(token, phraseType.getContinuationType());
         }
         if(states[0]){
             current.add(token);
+            if(log.isTraceEnabled()) {
+	        	log.trace(" {}. {} {}", new Object[]{ current.size(), token, 
+	        			logPosCategories(token)});
+            }
         }
         if(states.length > 1){
             valid = states[1];
@@ -148,7 +163,19 @@ public class PhraseBuilder {
                 Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken);
                 //TODO: add support for confidence
                 chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag));
+                if(log.isTraceEnabled()){
+                	log.trace("  << add {} phrase {} '{}'", new Object[]{
+                			phraseType.getPhraseType().name(), chunk,chunk.getSpan()});
+                }
+            } else if(log.isTraceEnabled()){
+            	log.trace("  >> ignore {} phrase with single {} ", 
+            			phraseType.getPhraseType().name() ,
+            			current.get(0));
             }
+        } else if(!current.isEmpty() && log.isTraceEnabled()){
+        	log.trace("  << ignore invalid {} phrase [{},{}]",  new Object[]{ 
+        			phraseType.getPhraseType().name(), current.get(0).getStart(), 
+        			current.get(current.size()-1).getEnd()});
         }
         //cleanup
         current.clear();
@@ -210,6 +237,27 @@ public class PhraseBuilder {
         }
         return matches;
     }
+    
+    /**
+     * used for trace level logging of Tokens part of a chunk
+     * @param token
+     * @return
+     */
+    private String logPosCategories(Token token){
+    	List<Value<PosTag>> posTags = token.getAnnotations(POS_ANNOTATION);
+    	List<String> catNames = new ArrayList<String>(posTags.size());
+    	for(Value<PosTag> tag : posTags){
+    		Set<LexicalCategory> cats = tag.value().getCategories();
+    		if(cats.size() > 1){
+    			catNames.add(cats.toString());
+    		} else if(!cats.isEmpty()){
+    			catNames.add(cats.iterator().next().toString());
+    		} else {
+    			catNames.add(tag.value().getTag());
+    		}
+    	}
+    	return catNames.toString();
+    }
 
     public static interface ChunkFactory {
         

Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java?rev=1559933&r1=1559932&r2=1559933&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java Tue Jan 21 08:16:20 2014
@@ -21,13 +21,36 @@ import java.util.EnumSet;
 import java.util.Set;
 
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
 
+/**
+ * Definition of a phrase type<p>
+ * 
+ * Phrases are defined by sets of POS tags of the following kinds: <ul>
+ * <li> required types - typically nouns for noun phrases, verbs for verb phrases.
+ * <li> start types - types that can start a new phrase
+ * <li> prefix types - types that can continue a phrase not yet containing a
+ * required token
+ * <li> continuation types - types that can continue a phrase already containing
+ * a required token
+ * <li> end types - types that can end a phrase. Used to remove trailing tokens
+ * from a phrase (typically punctuation).
+ * </ul>
+ * 
+ * <b>TODO:</b> Add support for {@link Pos} and String tags in addition to
+ * {@link LexicalCategory}.
+ * 
+ * @author Rupert Westenthaler
+ *
+ */
 public class PhraseTypeDefinition {
 
     protected final LexicalCategory phraseType;
     
     private final Set<LexicalCategory> startTypes;
     protected final Set<LexicalCategory> readOnlyStartTypes;
+    private final Set<LexicalCategory> prefixTypes;
+    protected final Set<LexicalCategory> readOnlyPrefixTypes;
     private final Set<LexicalCategory> continuationTypes;
     protected final Set<LexicalCategory> readOnlyContinuationTypes;
     private final Set<LexicalCategory> requiredTypes;
@@ -42,6 +65,8 @@ public class PhraseTypeDefinition {
         this.phraseType = phraseType;
         startTypes = EnumSet.of(phraseType);
         readOnlyStartTypes = Collections.unmodifiableSet(startTypes);
+        prefixTypes = EnumSet.of(phraseType);
+        readOnlyPrefixTypes = Collections.unmodifiableSet(prefixTypes);
         continuationTypes = EnumSet.of(phraseType);
         readOnlyContinuationTypes = Collections.unmodifiableSet(continuationTypes);
         requiredTypes = EnumSet.of(phraseType);
@@ -53,6 +78,10 @@ public class PhraseTypeDefinition {
     public boolean addStartType(LexicalCategory...types){
         return add(startTypes,types);
     }
+
+    public boolean addPrefixType(LexicalCategory...types){
+        return add(prefixTypes,types);
+    }
     
     public boolean addContinuationType(LexicalCategory...types){
         return add(continuationTypes,types);
@@ -69,6 +98,10 @@ public class PhraseTypeDefinition {
         return remove(startTypes,types);
     }
     
+    public boolean removePrefixType(LexicalCategory...types){
+        return remove(prefixTypes,types);
+    }
+    
     public boolean removeContinuationType(LexicalCategory...types){
         return remove(continuationTypes,types);
     }
@@ -89,18 +122,34 @@ public class PhraseTypeDefinition {
     }
     
     /**
-     * Getter for the read only set with the start types
+     * Getter for the read only set with the start types.
      * @return the read only set with {@link LexicalCategory LexicalCategories}
      * that can start a phrase of that type
      */
     public Set<LexicalCategory> getStartType(){
         return readOnlyStartTypes;
     }
+    /**
+     * Getter for the read only set with the prefix types
+     * @return the read only set with {@link LexicalCategory LexicalCategories}
+     * that can continue a phrase that does not yet include a token classified
+     * with a {@link #getRequiredType() required type}. A typical example is
+     * {@link LexicalCategory#Adjective} in Noun Phrases, which needs to be
+     * considered in prefixes (e.g. "A nice weekend") but excluded after the
+     * first noun (e.g. "the trip last week").
+     */
+    public Set<LexicalCategory> getPrefixType(){
+        return readOnlyPrefixTypes;
+    }
     
     /**
      * Getter for the read only set with the continuation types
      * @return the read only set with {@link LexicalCategory LexicalCategories}
-     * that can continue a phrase of that type
+     * that can continue a phrase that already includes a token classified
+     * with a {@link #getRequiredType() required type}. For example,
+     * {@link LexicalCategory#Adjective} in Noun Phrases is typically left out of
+     * this set: it is accepted in prefixes (e.g. "A nice weekend") but excluded
+     * after the first noun (e.g. "the trip last week").
      */
     public Set<LexicalCategory> getContinuationType(){
         return readOnlyContinuationTypes;
@@ -151,4 +200,9 @@ public class PhraseTypeDefinition {
         }
         return changed;
     }
+    
+    @Override
+    public String toString() {
+    	return phraseType.name();
+    }
 }

Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java?rev=1559933&r1=1559932&r2=1559933&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java Tue Jan 21 08:16:20 2014
@@ -83,7 +83,8 @@ import org.slf4j.LoggerFactory;
 @Service
 @Properties(value={
         @Property(name=EnhancementEngine.PROPERTY_NAME,value="pos-chunker"),
-        @Property(name=PosChunkerEngine.CONFIG_LANGUAGES, value = {"*"}),
+        @Property(name=PosChunkerEngine.CONFIG_LANGUAGES, 
+        	cardinality=Integer.MAX_VALUE, value = {"*"}),
         @Property(name=PosChunkerEngine.MIN_POS_SCORE, 
             doubleValue=PosChunkerEngine.DEFAULT_MIN_POS_SCORE),
         @Property(name=PosChunkerEngine.NOUN_PHRASE_STATE, 
@@ -121,14 +122,17 @@ public class PosChunkerEngine extends Ab
     private static final PhraseTypeDefinition VERB_PHRASE_TYPE;
 
     //TODO: maybe move this to PhraseTypeDefinition
+    //TODO: this might be language specific
+    //TODO: make configurable
     static {
         PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
         //start types noun (automatically included) pronoun or determiners, adjectives 
         nounPD.addStartType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
-        //continuation types are nouns, adpositions , pronouns, determiner, adjectives and punctations
-        //optionally one could also allow Adverbs, PronounOrDeterminer
-        nounPD.addContinuationType(LexicalCategory.Adjective, LexicalCategory.Adposition,
-            LexicalCategory.Punctuation); //LexicalCategory.PronounOrDeterminer, LexicalCategory.Adverb, );
+        //prefix types are the same as start types (e.g. "the nice trip")
+        nounPD.addPrefixType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        //continuation types are nouns and punctuation.
+        //NOTE: Adverbs are excluded to avoid phrases like "the nice trip last week"
+        nounPD.addContinuationType(LexicalCategory.Punctuation);
         //end types are the same as start terms
         nounPD.addEndType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
         //and required types do include a Noun (what is actually included by default)
@@ -230,37 +234,38 @@ public class PosChunkerEngine extends Ab
         for(PhraseBuilder pb : phraseBuilders){
             pb.nextSection(null);
         }
-        if(log.isTraceEnabled()){
-            logChunks(at);
-        }
+//        if(log.isTraceEnabled()){
+//            logChunks(at);
+//        }
     }
     
     @Override
     public Map<String,Object> getServiceProperties() {
         return SERVICE_PROPERTIES;
     }
-    
-    private void logChunks(AnalysedText at){
-        Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
-        while(it.hasNext()){
-            Span span = it.next();
-            if(span.getType() == SpanTypeEnum.Chunk){
-                Value<PhraseTag> phraseAnno = span.getAnnotation(PHRASE_ANNOTATION);
-                log.trace(" > {} Phrase: {} {}", new Object[]{
-                    phraseAnno != null ? phraseAnno.value().getTag() : "unknown",
-                    span, span.getSpan()});
-                log.trace("  Tokens: ");
-                int i = 1;
-                for(Iterator<Token> tokens = ((Chunk)span).getTokens(); tokens.hasNext();i++){
-                    Token token = tokens.next();
-                    log.trace("    {}. {}{}", new Object[]{i,token.getSpan(),
-                            token.getAnnotations(NlpAnnotations.POS_ANNOTATION)});
-                }
-            } else {
-                log.trace("--- {}",span);
-            }
-        }
-    }
+
+//logging is now done by the PhraseBuilder
+//    private void logChunks(AnalysedText at){
+//        Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
+//        while(it.hasNext()){
+//            Span span = it.next();
+//            if(span.getType() == SpanTypeEnum.Chunk){
+//                Value<PhraseTag> phraseAnno = span.getAnnotation(PHRASE_ANNOTATION);
+//                log.trace(" > {} Phrase: {} {}", new Object[]{
+//                    phraseAnno != null ? phraseAnno.value().getTag() : "unknown",
+//                    span, span.getSpan()});
+//                log.trace("  Tokens: ");
+//                int i = 1;
+//                for(Iterator<Token> tokens = ((Chunk)span).getTokens(); tokens.hasNext();i++){
+//                    Token token = tokens.next();
+//                    log.trace("    {}. {}{}", new Object[]{i,token.getSpan(),
+//                            token.getAnnotations(NlpAnnotations.POS_ANNOTATION)});
+//                }
+//            } else {
+//                log.trace("--- {}",span);
+//            }
+//        }
+//    }
 
     /**
      * Activate and read the properties. Configures and initialises a ChunkerHelper for each language configured in