You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/21 09:16:20 UTC
svn commit: r1559933 - in
/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker:
PhraseBuilder.java PhraseTypeDefinition.java engine/PosChunkerEngine.java
Author: rwesten
Date: Tue Jan 21 08:16:20 2014
New Revision: 1559933
URL: http://svn.apache.org/r1559933
Log:
STANBOL-1251: added prefix type category to PhraseDefinition; added detailed trace level logging to the PhraseBuilder - commit for 0.12 branch
Modified:
stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java?rev=1559933&r1=1559932&r2=1559933&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java Tue Jan 21 08:16:20 2014
@@ -32,9 +32,13 @@ import org.apache.stanbol.enhancer.nlp.m
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class PhraseBuilder {
+ private final Logger log = LoggerFactory.getLogger(getClass());
+
/**
* Just a fallback in case Pos annotations do not provide probabilities.
* In most cases the value of this will not have any effect as typically
@@ -75,6 +79,7 @@ public class PhraseBuilder {
throw new IllegalArgumentException("The parsed PhraseTypeDefinition MUST NOT be NULL!");
}
this.phraseType = phraseType;
+ log.debug("Create {} for {}",getClass().getSimpleName(),phraseType);
this.phraseTag = new PhraseTag(phraseType.getPhraseType().name(),
phraseType.getPhraseType());
if(chunkFactory == null){
@@ -100,6 +105,7 @@ public class PhraseBuilder {
public void nextSection(Section section){
buildPhrase(null);
+ log.trace("-- next {} --", section);
}
@@ -109,6 +115,11 @@ public class PhraseBuilder {
phraseType.getRequiredType());
if(states[0]){
current.add(token);
+ if(log.isTraceEnabled()) {
+ log.trace("-- {} phrase start --", phraseType.getPhraseType().name());
+ log.trace(" {}. {} {}", new Object[]{ current.size(), token,
+ logPosCategories(token)});
+ }
valid = states[1];
}
}
@@ -116,14 +127,18 @@ public class PhraseBuilder {
@SuppressWarnings("unchecked") //varargs with generic types
private boolean checkContinuation(Token token){
final boolean[] states;
- if(!valid){
- states = checkCategories(token, phraseType.getContinuationType(),
+ if(!valid){ //check for prefix types and required types
+ states = checkCategories(token, phraseType.getPrefixType(),
phraseType.getRequiredType());
- } else {
+ } else { //check for continuation types
states = checkCategories(token, phraseType.getContinuationType());
}
if(states[0]){
current.add(token);
+ if(log.isTraceEnabled()) {
+ log.trace(" {}. {} {}", new Object[]{ current.size(), token,
+ logPosCategories(token)});
+ }
}
if(states.length > 1){
valid = states[1];
@@ -148,7 +163,19 @@ public class PhraseBuilder {
Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken);
//TODO: add support for confidence
chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag));
+ if(log.isTraceEnabled()){
+ log.trace(" << add {} phrase {} '{}'", new Object[]{
+ phraseType.getPhraseType().name(), chunk,chunk.getSpan()});
+ }
+ } else if(log.isTraceEnabled()){
+ log.trace(" >> ignore {} phrase with single {} ",
+ phraseType.getPhraseType().name() ,
+ current.get(0));
}
+ } else if(!current.isEmpty() && log.isTraceEnabled()){
+ log.trace(" << ignore invalid {} phrase [{},{}]", new Object[]{
+ phraseType.getPhraseType().name(), current.get(0).getStart(),
+ current.get(current.size()-1).getEnd()});
}
//cleanup
current.clear();
@@ -210,6 +237,27 @@ public class PhraseBuilder {
}
return matches;
}
+
+ /**
+ * used for trace level logging of Tokens part of a chunk
+ * @param token
+ * @return
+ */
+ private String logPosCategories(Token token){
+ List<Value<PosTag>> posTags = token.getAnnotations(POS_ANNOTATION);
+ List<String> catNames = new ArrayList<String>(posTags.size());
+ for(Value<PosTag> tag : posTags){
+ Set<LexicalCategory> cats = tag.value().getCategories();
+ if(cats.size() > 1){
+ catNames.add(cats.toString());
+ } else if(!cats.isEmpty()){
+ catNames.add(cats.iterator().next().toString());
+ } else {
+ catNames.add(tag.value().getTag());
+ }
+ }
+ return catNames.toString();
+ }
public static interface ChunkFactory {
Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java?rev=1559933&r1=1559932&r2=1559933&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java Tue Jan 21 08:16:20 2014
@@ -21,13 +21,36 @@ import java.util.EnumSet;
import java.util.Set;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+/**
+ * Definition of a phrase type<p>
+ *
+ * Phrases are defined by the following sets of lexical categories: <ul>
+ * <li> required Tokens - typically noun for noun phrases, verbs for verb phrases.
+ * <li> start types - types that can start a new phrase
+ * <li> prefix types - types that can continue a phrase not yet containing a
+ * required token
+ * <li> continuation types - types that can continue a phrase already containing
+ * a required token
+ * <li> end types - types that can end a phrase. Used to remove trailing tokens
+ * from a phrase (typically punctuation).
+ * </ul>
+ *
+ * <b>TODO:</b> Add support for {@link Pos} and String tags in addition to
+ * {@link LexicalCategory}.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
public class PhraseTypeDefinition {
protected final LexicalCategory phraseType;
private final Set<LexicalCategory> startTypes;
protected final Set<LexicalCategory> readOnlyStartTypes;
+ private final Set<LexicalCategory> prefixTypes;
+ protected final Set<LexicalCategory> readOnlyPrefixTypes;
private final Set<LexicalCategory> continuationTypes;
protected final Set<LexicalCategory> readOnlyContinuationTypes;
private final Set<LexicalCategory> requiredTypes;
@@ -42,6 +65,8 @@ public class PhraseTypeDefinition {
this.phraseType = phraseType;
startTypes = EnumSet.of(phraseType);
readOnlyStartTypes = Collections.unmodifiableSet(startTypes);
+ prefixTypes = EnumSet.of(phraseType);
+ readOnlyPrefixTypes = Collections.unmodifiableSet(prefixTypes);
continuationTypes = EnumSet.of(phraseType);
readOnlyContinuationTypes = Collections.unmodifiableSet(continuationTypes);
requiredTypes = EnumSet.of(phraseType);
@@ -53,6 +78,10 @@ public class PhraseTypeDefinition {
public boolean addStartType(LexicalCategory...types){
return add(startTypes,types);
}
+
+ public boolean addPrefixType(LexicalCategory...types){
+ return add(prefixTypes,types);
+ }
public boolean addContinuationType(LexicalCategory...types){
return add(continuationTypes,types);
@@ -69,6 +98,10 @@ public class PhraseTypeDefinition {
return remove(startTypes,types);
}
+ public boolean removePrefixType(LexicalCategory...types){
+ return remove(prefixTypes,types);
+ }
+
public boolean removeContinuationType(LexicalCategory...types){
return remove(continuationTypes,types);
}
@@ -89,18 +122,34 @@ public class PhraseTypeDefinition {
}
/**
- * Getter for the read only set with the start types
+ * Getter for the read only set with the start types.
* @return the read only set with {@link LexicalCategory LexicalCategories}
* that can start a phrase of that type
*/
public Set<LexicalCategory> getStartType(){
return readOnlyStartTypes;
}
+ /**
+ * Getter for the read only set with the prefix types
+ * @return the read only set with {@link LexicalCategory LexicalCategories}
+ * that can continue a phrase that does not yet include a token classified
+ * with a {@link #getRequiredType() required type}. A typical example is
+ * {@link LexicalCategory#Adjective} in Noun Phrases, which needs to be
+ * considered in prefixes (e.g. "A nice weekend") but excluded after the
+ * first noun (e.g. "the trip last week").
+ */
+ public Set<LexicalCategory> getPrefixType(){
+ return readOnlyPrefixTypes;
+ }
/**
* Getter for the read only set with the continuation types
* @return the read only set with {@link LexicalCategory LexicalCategories}
- * that can continue a phrase of that type
+ * that can continue a phrase that does already include a token classified
+ * with a {@link #getRequiredType() required type}. Note that categories
+ * accepted as {@link #getPrefixType() prefix types} may be excluded here: e.g.
+ * {@link LexicalCategory#Adjective} in Noun Phrases is considered in prefixes
+ * (e.g. "A nice weekend") but excluded after the first noun (e.g. "the trip last week").
*/
public Set<LexicalCategory> getContinuationType(){
return readOnlyContinuationTypes;
@@ -151,4 +200,9 @@ public class PhraseTypeDefinition {
}
return changed;
}
+
+ @Override
+ public String toString() {
+ return phraseType.name();
+ }
}
Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java?rev=1559933&r1=1559932&r2=1559933&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java Tue Jan 21 08:16:20 2014
@@ -83,7 +83,8 @@ import org.slf4j.LoggerFactory;
@Service
@Properties(value={
@Property(name=EnhancementEngine.PROPERTY_NAME,value="pos-chunker"),
- @Property(name=PosChunkerEngine.CONFIG_LANGUAGES, value = {"*"}),
+ @Property(name=PosChunkerEngine.CONFIG_LANGUAGES,
+ cardinality=Integer.MAX_VALUE, value = {"*"}),
@Property(name=PosChunkerEngine.MIN_POS_SCORE,
doubleValue=PosChunkerEngine.DEFAULT_MIN_POS_SCORE),
@Property(name=PosChunkerEngine.NOUN_PHRASE_STATE,
@@ -121,14 +122,17 @@ public class PosChunkerEngine extends Ab
private static final PhraseTypeDefinition VERB_PHRASE_TYPE;
//TODO: maybe move this to PhraseTypeDefinition
+ //TODO: this might be language specific
+ //TODO: make configurable
static {
PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
//start types noun (automatically included) pronoun or determiners, adjectives
nounPD.addStartType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
- //continuation types are nouns, adpositions , pronouns, determiner, adjectives and punctations
- //optionally one could also allow Adverbs, PronounOrDeterminer
- nounPD.addContinuationType(LexicalCategory.Adjective, LexicalCategory.Adposition,
- LexicalCategory.Punctuation); //LexicalCategory.PronounOrDeterminer, LexicalCategory.Adverb, );
+ //prefix types are the same as start types (e.g. "the nice trip")
+ nounPD.addPrefixType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ //continuation types are nouns and punctuation.
+ //NOTE: Adverbs are excluded to avoid phrases like "the nice trip last week"
+ nounPD.addContinuationType(LexicalCategory.Punctuation);
//end types are the same as start terms
nounPD.addEndType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
//and required types do include a Noun (what is actually included by default)
@@ -230,37 +234,38 @@ public class PosChunkerEngine extends Ab
for(PhraseBuilder pb : phraseBuilders){
pb.nextSection(null);
}
- if(log.isTraceEnabled()){
- logChunks(at);
- }
+// if(log.isTraceEnabled()){
+// logChunks(at);
+// }
}
@Override
public Map<String,Object> getServiceProperties() {
return SERVICE_PROPERTIES;
}
-
- private void logChunks(AnalysedText at){
- Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
- while(it.hasNext()){
- Span span = it.next();
- if(span.getType() == SpanTypeEnum.Chunk){
- Value<PhraseTag> phraseAnno = span.getAnnotation(PHRASE_ANNOTATION);
- log.trace(" > {} Phrase: {} {}", new Object[]{
- phraseAnno != null ? phraseAnno.value().getTag() : "unknown",
- span, span.getSpan()});
- log.trace(" Tokens: ");
- int i = 1;
- for(Iterator<Token> tokens = ((Chunk)span).getTokens(); tokens.hasNext();i++){
- Token token = tokens.next();
- log.trace(" {}. {}{}", new Object[]{i,token.getSpan(),
- token.getAnnotations(NlpAnnotations.POS_ANNOTATION)});
- }
- } else {
- log.trace("--- {}",span);
- }
- }
- }
+
+//logging is now done by the PhraseBuilder
+// private void logChunks(AnalysedText at){
+// Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
+// while(it.hasNext()){
+// Span span = it.next();
+// if(span.getType() == SpanTypeEnum.Chunk){
+// Value<PhraseTag> phraseAnno = span.getAnnotation(PHRASE_ANNOTATION);
+// log.trace(" > {} Phrase: {} {}", new Object[]{
+// phraseAnno != null ? phraseAnno.value().getTag() : "unknown",
+// span, span.getSpan()});
+// log.trace(" Tokens: ");
+// int i = 1;
+// for(Iterator<Token> tokens = ((Chunk)span).getTokens(); tokens.hasNext();i++){
+// Token token = tokens.next();
+// log.trace(" {}. {}{}", new Object[]{i,token.getSpan(),
+// token.getAnnotations(NlpAnnotations.POS_ANNOTATION)});
+// }
+// } else {
+// log.trace("--- {}",span);
+// }
+// }
+// }
/**
* Activate and read the properties. Configures and initialises a ChunkerHelper for each language configured in