You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:40:11 UTC
svn commit: r1413155 [4/4] - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking: ./
src/ src/license/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/stanbol/
src/main/java/org/apac...
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,675 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/**
+ *
+ */
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static java.util.Collections.disjoint;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.collections.Predicate;
+import org.apache.commons.collections.iterators.FilterIterator;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ProcessingState {
+
+ private final Logger log = LoggerFactory.getLogger(ProcessingState.class);
+
+ /**
+ * Iterator over the sentences (might be
+ * the whole {@link AnalysedText} if no sentences are
+ * defined).
+ */
+ private final Iterator<? extends Section> sections;
+ /**
+ * The sentence currently processed
+ */
+ private Section section;
+ /**
+ * Holds the {@link TokenData} for the {@link Token}s of the current
+ * {@link #section} to allow fast index based access.
+ */
+ private List<TokenData> tokens = new ArrayList<TokenData>(64);
+
+ @SuppressWarnings("unchecked")
+ private Iterator<TokenData> processableTokensIterator = Collections.EMPTY_LIST.iterator();
+
+ private final EnumSet<SpanTypeEnum> enclosedSpanTypes;
+ /**
+ * The current token
+ */
+ private TokenData token;
+ /**
+ * The index of the last consumed token (<code>-1</code> if none was consumed yet)
+ */
+ private int consumedIndex = -1;
+ /**
+ * The language of the text
+ */
+ private String language;
+
+ protected final LanguageProcessingConfig tpc;
+ protected final EntityLinkerConfig elc;
+
+ /**
+  * Predicate accepting only {@link TokenData} instances that are marked as
+  * {@link TokenData#isProcessable processable}. Used to filter the tokens
+  * of the current section.
+  */
+ private static final Predicate PROCESSABLE_TOKEN_OREDICATE = new Predicate() {
+     @Override
+     public boolean evaluate(Object object) {
+         final TokenData candidate = (TokenData) object;
+         return candidate.isProcessable;
+     }
+ };
+
+ public ProcessingState(AnalysedText at, String language, LanguageProcessingConfig tpc, EntityLinkerConfig elc){
+ if(at == null){
+ throw new IllegalArgumentException("The parsed AnalysedText MUST NOT be NULL!");
+ }
+ if(language == null || language.isEmpty()){
+ throw new IllegalArgumentException("The parsed Language MUST NOT be NULL nor empty!");
+ }
+ if(tpc == null){
+ throw new IllegalArgumentException("The parsed TextProcessingConfig MUST NOT be NULL!");
+ }
+ if(elc == null){
+ throw new IllegalArgumentException("The parsed EntityLinkerConfig MUST NOT be NULL!");
+ }
+ this.tpc = tpc;
+ this.elc = elc;
+ enclosedSpanTypes = EnumSet.of(SpanTypeEnum.Token);
+
+ if(!tpc.isIgnoreChunks()){
+ enclosedSpanTypes.add(SpanTypeEnum.Chunk);
+ }
+
+ this.language = language;
+ //prefer to iterate over sentences
+ Iterator<Sentence> sentences = at.getSentences();
+ this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator();
+ //init the first sentence
+ initNextSentence();
+ }
+ /**
+  * Getter for the section that is currently processed. Depending on the
+  * available annotations this is either a {@link Sentence} or - if no
+  * sentences are present - the whole {@link AnalysedText}.
+  * @return the currently processed {@link Section}
+  */
+ public final Section getSentence() {
+     return this.section;
+ }
+ /**
+  * Getter for the token selected by the last successful call to
+  * {@link #next()}.
+  * @return the data of the currently processed token or <code>null</code>
+  * if {@link #next()} has not yet selected a token
+  */
+ public TokenData getToken(){
+     return this.token;
+ }
+ /**
+  * Getter for the tokens of the currently processed section. The returned
+  * list is the internally used instance (no copy); it is cleared and refilled
+  * when the state moves on to the next section.
+  * @return the tokens of the currently processed section
+  */
+ public List<TokenData> getTokens(){
+     return this.tokens;
+ }
+
+ /**
+  * Getter for the index of the last consumed token.
+  * @return the index of the last consumed token or <code>-1</code> if no
+  * token of the current section was consumed so far
+  */
+ public final int getConsumedIndex() {
+     return this.consumedIndex;
+ }
+
+
+ /**
+  * Getter for the language parsed to the constructor of this state.
+  * @return the language of the processed text
+  */
+ public final String getLanguage() {
+     return this.language;
+ }
+// /**
+// * Getter for the next {@link Token} to be processed. Calling {@link #next()}
+// * is guaranteed to skip all tokens in between {@link #getTokenIndex()}
+// * and {@link #getNextToken()}, but it might even skip more tokens (e.g.
+// * in case that the token referenced by {@link #getNextToken()} is not
+// * within a {@link Chunk}
+// * @return the nextToken
+// */
+// public final int getNextToken() {
+// return nextToken;
+// }
+
+ /**
+  * Sets the index of the last consumed token. The parsed index MUST BE
+  * equal to or greater than the index of the {@link #getToken() current
+  * token}. Tokens of the current section with an index lower than or equal
+  * to the consumed index are skipped by following calls to {@link #next()}.
+  * @param pos the index of the last consumed token.
+  * @throws IllegalArgumentException if the parsed index is lower than the
+  * index of the current token
+  */
+ public void setConsumed(int pos){
+     if(pos < token.index){ //consumed tokens can not be located before the current one
+         throw new IllegalArgumentException("The lastConsumedPos "+pos+
+             " MUST BE equals or gerater than the current Pos "+token.index);
+     }
+     this.consumedIndex = pos;
+ }
+
+ /**
+  * Moves the state to the next processable token with an index greater
+  * than the {@link #getConsumedIndex() consumed index}. If the processable
+  * tokens of the current section are exhausted, the next section
+  * containing a processable token is initialised.
+  * @return <code>true</code> if there are further elements to process or
+  * <code>false</code> if there are no further elements to process.
+  */
+ public boolean next() {
+     for(;;){
+         //switch to the next section if the current one has no more candidates
+         if(!processableTokensIterator.hasNext() && !initNextSentence()){
+             return false; //neither tokens nor sections left
+         }
+         final TokenData candidate = processableTokensIterator.next();
+         if(candidate.index > consumedIndex){ //skip already consumed tokens
+             this.token = candidate;
+             return true;
+         }
+     }
+ }
+
+ /**
+  * Initialises {@link #section}, {@link #tokens}, {@link #consumedIndex} and
+  * {@link #processableTokensIterator} for the next element of
+  * {@link #sections} that contains at least one processable token.
+  * Sections without any processable token are skipped.
+  * <p>
+  * For every visited section the enclosed {@link Chunk}s and {@link Token}s
+  * are iterated: processable chunks are tracked as {@link ChunkData}
+  * (partly overlapping processable chunks are merged), and for each token a
+  * {@link TokenData} is created and added to {@link #tokens}. If
+  * {@link LanguageProcessingConfig#isLinkMultiMatchableTokensInChunk()} is
+  * enabled, matchable tokens directly following another matchable token
+  * within the same chunk are converted to processable tokens.
+  * <p>
+  * The token list is cleared for <b>every</b> visited section so that
+  * {@link TokenData#index} is always relative to the section returned by
+  * {@link #getSentence()}.
+  * @return <code>true</code> if a section with a processable token was
+  * found; <code>false</code> if no further section contains one
+  */
+ private boolean initNextSentence() {
+     section = null;
+     tokens.clear(); //ensure an empty list if there are no further sections
+     processableTokensIterator = null;
+     consumedIndex = -1;
+     boolean foundProcessable = false;
+     while(!foundProcessable && sections.hasNext()){
+         section = sections.next();
+         //drop the tokens of a previously visited section that had no
+         //processable token: otherwise token indexes would no longer be
+         //relative to the current section
+         tokens.clear();
+         Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
+         ChunkData activeChunk = null;
+         while(enclosed.hasNext()){
+             Span span = enclosed.next();
+             if(span.getType() == SpanTypeEnum.Chunk){
+                 ChunkData chunkData = new ChunkData((Chunk)span);
+                 if(chunkData.isProcessable){
+                     if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
+                         if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
+                             log.info(" - merge overlapping and processable Chunks {} <-> {}",
+                                 activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
+                             activeChunk.merged = (Chunk)span; //set this one as last merged
+                         } //ignore completely covered chunks
+                     } else { // a new Chunk starts
+                         activeChunk = chunkData;
+                         //index of the first token that will be added for this chunk
+                         activeChunk.startToken = tokens.size();
+                         if(log.isDebugEnabled()){
+                             log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
+                                 new Object []{
+                                     activeChunk.chunk.getType(),
+                                     activeChunk.startToken,
+                                     activeChunk.chunk.getSpan()
+                                 });
+                         }
+                     }
+                 } //else ignore chunks that are not processable
+             } else if(span.getType() == SpanTypeEnum.Token){
+                 TokenData tokenData = new TokenData(tokens.size(),(Token)span,activeChunk);
+                 if(log.isDebugEnabled()){
+                     log.debug(" > Token {}: {} (pos:{}) chunk: '{}' | morpho: {}",
+                         new Object[]{tokenData.index,tokenData.token,
+                             tokenData.token.getAnnotations(POS_ANNOTATION),
+                             tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none",
+                             tokenData.morpho != null ? tokenData.morpho : "none"});
+                 }
+                 tokens.add(tokenData);
+                 foundProcessable = foundProcessable || tokenData.isProcessable;
+                 if(activeChunk != null){
+                     if(tokenData.isMatchable ){
+                         activeChunk.matchableCount++;
+                     }
+                     if (span.getEnd() >= activeChunk.getEndChar()){
+                         //this is the last token in the current chunk
+                         activeChunk.endToken = tokens.size()-1;
+                         log.debug(" - end Chunk@pos: {}", activeChunk.endToken);
+                         if(tpc.isLinkMultiMatchableTokensInChunk() &&
+                                 activeChunk.matchableCount > 1 ){
+                             log.debug(" - multi-matchable Chunk:");
+                             //mark the last of two immediate following matchable
+                             //tokens as processable
+                             for(int i = activeChunk.endToken-1;i >= activeChunk.startToken+1;i--){
+                                 TokenData ct = tokens.get(i);
+                                 TokenData pt = tokens.get(i-1);
+                                 if(ct.isMatchable && pt.isMatchable){
+                                     if(!ct.isProcessable) { //if not already processable
+                                         log.debug(" > convert Token {}: {} (pos:{}) from matchable to processable",
+                                             new Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
+                                         ct.isProcessable = true;
+                                         foundProcessable = true;
+                                     }
+                                     i--;//mark both (ct & pt) as processed
+                                 }
+                             }
+                         }
+                         activeChunk = null;
+                     }
+                 }
+             }
+         }
+         if(activeChunk != null) { //close the last chunk (if not done)
+             activeChunk.endToken = tokens.size()-1;
+         }
+     }
+     processableTokensIterator = new FilterIterator(tokens.iterator(), PROCESSABLE_TOKEN_OREDICATE);
+     return foundProcessable;
+ }
+ /**
+  * Getter for the text covered by <code>tokenCount</code> tokens of the
+  * current section, starting with the token at index <code>start</code>.
+  * Given the Tokens
+  * <pre>
+  * [This, is, an, Example]
+  * </pre>
+  * and the parameters <code>start=0</code> and <code>tokenCount=3</code>
+  * this method will return
+  * <pre>
+  * This is an
+  * </pre>
+  * @param start the index (within {@link #getTokens()}) of the first token
+  * @param tokenCount the number of tokens to be included (including the
+  * token at <code>start</code>)
+  * @return the text of the current section from the start of the token at
+  * index <code>start</code> to the end of the token at index
+  * <code>start+tokenCount-1</code>.
+  */
+ public String getTokenText(int start, int tokenCount){
+     //token offsets need to be converted to offsets relative to the section
+     final int sectionStart = section.getStart();
+     final String sectionText = section.getSpan();
+     final int begin = tokens.get(start).token.getStart()-sectionStart;
+     final int end = tokens.get(start+(tokenCount-1)).token.getEnd()-sectionStart;
+     return sectionText.substring(begin, end);
+ }
+
+// /**
+
+// */
+// protected boolean getProcessablePosTag(Token token) {
+// for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
+// // check three possible match
+// // 1. the LexicalCategory matches
+// // 2. the Pos matches
+// // 3. the String tag matches
+// PosTag posTag = posAnnotation.value();
+//// log.debug(" ... check PosAnntation {} (lc:{}|pos:{}|tag:{}",
+//// new Object[]{posAnnotation,posTag.getCategories(),
+//// posTag.getPosHierarch(),posTag.getTag()});
+// if((!Collections.disjoint(tpc.getProcessedLexicalCategories(),
+// posTag.getCategories())) ||
+// (!Collections.disjoint(tpc.getProcessedPos(),
+// posTag.getPosHierarchy())) ||
+// tpc.getProcessedPosTags().contains(
+// posTag.getTag())){
+// if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+// return true;
+// } // else probability to low for inclusion
+// } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+// return false;
+// } // else probability to low for exclusion
+// }
+// return token.getSpan().length() >= elc.getMinSearchTokenLength();
+// }
+
+// Both
+// protected boolean isMatchableToken(Token token){
+// for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
+// PosTag posTag = posAnnotation.value();
+// if(posTag.isMapped()){
+// if(!Collections.disjoint(tpc.getMatchableLexicalCategories(),
+// posTag.getCategories())){
+// if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+// return true;
+// } // else probability to low for inclusion
+// } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+// return false;
+// } // else probability to low for exclusion
+// } //else not matched ... search next one
+// }
+// return token.getSpan().length() >= elc.getMinSearchTokenLength();
+// }
+//
+//
+// protected boolean isProcesableChunk(Chunk chunk){
+// for(Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)){
+// if(tpc.getProcessedPhraseCategories().contains(
+// phraseAnnotation.value().getCategory()) ||
+// tpc.getProcessedPhraseTags().contains(
+// phraseAnnotation.value().getTag())){
+// if(phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()){
+// return true;
+// } // else probability to low for inclusion
+// } else if(phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()){
+// return false;
+// } // else probability to low for exclusion
+// }
+// //neither a clear accept/reject ...
+// return true;
+// }
+
+ /**
+  * Builds a short description of the state consisting of the current
+  * token (index and span), its chunk (including a possibly merged one) and
+  * the first 45 characters of the current section.
+  * <p>
+  * This method is safe to call at any time: if no token was selected so far
+  * (no successful call to {@link #next()}) "none" is used for the token and
+  * the chunk instead of failing with a {@link NullPointerException}.
+  */
+ @Override
+ public String toString() {
+     StringBuilder sb = new StringBuilder();
+     if(token == null){ //next() has not selected a token so far
+         sb.append("[none] chunk: none");
+     } else {
+         sb.append('[').append(token.index).append(',').append(token.token);
+         sb.append("] chunk: ");
+         if(token.inChunk == null){
+             sb.append("none");
+         } else {
+             sb.append(token.inChunk.chunk);
+             if(token.inChunk.merged != null){
+                 sb.append("(merged with ").append(token.inChunk.merged).append(')');
+             }
+         }
+     }
+     sb.append("| sentence: ");
+     if(section == null){
+         sb.append("none");
+     } else if(section.getSpan().length() > 45){
+         //only include the start of long sections
+         sb.append(section.getSpan().substring(0, 45)).append(" ...");
+     } else {
+         sb.append(section.getSpan());
+     }
+     return sb.toString();
+ }
+
+ /**
+  * Internally used to store additional metadata for the {@link Token}s of the
+  * current section.
+  * <p>
+  * When constructed, this class decides if the {@link Token} is
+  * <ul>
+  * <li> <b>processable</b>: the token is linked with the vocabulary
+  * <li> <b>matchable</b>: the token is used for matching labels of
+  * suggestions (multi word searches). Processable tokens are always matchable.
+  * </ul>
+  * Both decisions are taken first based on the POS annotations of the token
+  * and second - if no POS annotation allows a decision - on the
+  * {@link EntityLinkerConfig#getMinSearchTokenLength()}: tokens with at least
+  * this number of characters are accepted.
+  * <p>
+  * Since STANBOL-685 two POS probabilities are used <ul>
+  * <li> {@link LanguageProcessingConfig#getMinPosAnnotationProbability()} for accepting POS tags that are
+  * configured for linking/matching (e.g. {@link LanguageProcessingConfig#getLinkedLexicalCategories()} or
+  * {@link LanguageProcessingConfig#getLinkedPosTags()}).
+  * <li> {@link LanguageProcessingConfig#getMinExcludePosAnnotationProbability()} for rejecting tokens
+  * based on POS tags that are not configured.
+  * </ul>
+  * POS annotations with a probability lower than the according threshold do
+  * neither accept nor reject the token; in that case the next POS annotation
+  * (or finally the token length) is used. This makes it less likely that the
+  * token length needs to be used as fallback.
+  * <p>
+  * (see also STANBOL-685 even that this Issue refers a version of this Engine that has not yet used the
+  * Stanbol NLP processing chain)
+  */
+ class TokenData {
+     /** The Token */
+     final Token token;
+     /** The index of the Token within the current Section (Sentence) */
+     final int index;
+     /** If this Token should be linked with the Vocabulary */
+     boolean isProcessable;
+     /** If this Token should be used for multi word searches in the Vocabulary */
+     boolean isMatchable;
+     /** if this Token has an alpha or numeric char */
+     final boolean hasAlphaNumeric;
+     /** the chunk of this Token (<code>null</code> if the token is not within a chunk) */
+     final ChunkData inChunk;
+     /** the morphological features of the Token (selected based on the POS Tag); might be <code>null</code> */
+     final MorphoFeatures morpho;
+
+     /**
+      * Constructs and initializes meta data needed for linking based
+      * on the current tokens (and its NLP annotation)
+      * @param index the index of the Token within the current section
+      * @param token the token
+      * @param chunk the current chunk or <code>null</code> if none
+      */
+     TokenData(int index,Token token, ChunkData chunk) {
+         //(0) init fields
+         this.token = token;
+         this.index = index;
+         this.inChunk = chunk;
+         this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
+
+         PosTag selectedPosTag = null;
+         boolean matchedPosTag = false; //matched any of the POS annotations
+
+         //(1) check if this Token should be linked against the Vocabulary (isProcessable)
+         //NOTE: the first token of a section is never treated as upper case token
+         boolean upperCase = index > 0 && Character.isUpperCase(token.getSpan().codePointAt(0));
+         if(tpc.isLinkUpperCaseTokens() && upperCase){
+             isProcessable = true;
+         } else { //else use POS tag & token length
+             for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
+                 // check three possible match
+                 // 1. the LexicalCategory matches
+                 // 2. the Pos matches
+                 // 3. the String tag matches
+                 PosTag posTag = posAnnotation.value();
+                 if((!disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories())) ||
+                         (!disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy())) ||
+                         tpc.getLinkedPosTags().contains(posTag.getTag())){
+                     if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+                         selectedPosTag = posTag;
+                         isProcessable = true;
+                         matchedPosTag = true;
+                         break;
+                     } // else probability to low for inclusion
+                 } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+                     selectedPosTag = posTag; //also rejected PosTags are selected
+                     matchedPosTag = true;
+                     isProcessable = false;
+                     break;
+                 } // else probability to low for exclusion
+             }
+             if(!matchedPosTag) { //not matched against a POS Tag ...
+                 // ... fall back to the token length: tokens need to have at
+                 // least the configured minimum length (same rule as used
+                 // for matchable tokens below)
+                 isProcessable = token.getSpan().length() >= elc.getMinSearchTokenLength();
+             }
+         }
+
+         //(2) check if this token should be considered to match labels of suggestions
+         if(isProcessable){ //processable tokens are also matchable
+             isMatchable = true;
+         } else if(tpc.isMatchUpperCaseTokens() && upperCase){
+             //match upper case tokens regardless of POS and length
+             isMatchable = true;
+         } else { //check POS and length to see if token is matchable
+             matchedPosTag = false; //reset to false!
+             for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
+                 PosTag posTag = posAnnotation.value();
+                 if(posTag.isMapped()){
+                     if(!Collections.disjoint(tpc.getMatchedLexicalCategories(),
+                             posTag.getCategories())){
+                         if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+                             //override selectedPosTag if present
+                             selectedPosTag = posTag; //mark the matchable as selected PosTag
+                             isMatchable = true;
+                             matchedPosTag = true;
+                             break;
+                         } // else probability to low for inclusion
+                     } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+                         if(selectedPosTag == null){ //do not override existing values
+                             selectedPosTag = posTag; //also rejected PosTags are selected
+                         }
+                         isMatchable = false;
+                         matchedPosTag = true;
+                         break;
+                     } // else probability to low for exclusion
+                 } //else not matched ... search next one
+             }
+             if(!matchedPosTag){ //not matched against POS tag ...
+                 //fall back to the token length
+                 isMatchable = token.getSpan().length() >= elc.getMinSearchTokenLength();
+             }
+         }
+
+         //(3) check for morpho analyses
+         if(selectedPosTag == null){ //token is not processable or matchable
+             //we need to set the selected POS tag to the first POS annotation
+             Value<PosTag> posAnnotation = token.getAnnotation(POS_ANNOTATION);
+             if(posAnnotation != null) {
+                 selectedPosTag = posAnnotation.value();
+             }
+         }
+         List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
+         if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
+             morpho = morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
+         } else { //select the correct morpho annotation based on the POS tag
+             MorphoFeatures mf = null;
+             selectMorphoFeature :
+             for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
+                 for(PosTag posTag : morphoAnnotation.value().getPosList()){
+                     if(!disjoint(selectedPosTag.getCategories(),posTag.getCategories())){
+                         mf = morphoAnnotation.value();
+                         break selectMorphoFeature; //stop after finding the first one
+                     }
+                 }
+             }
+             morpho = mf;
+         }
+     }
+     /**
+      * Getter for the text as used for searching/matching
+      * Entities in the linked vocabulary. If
+      * {@link EntityLinkerConfig#isLemmaMatching()} is
+      * enabled and morphological features are available for this token the
+      * {@link MorphoFeatures#getLemma()} is returned.
+      * Otherwise the {@link Token#getSpan()} is returned
+      * @return the text of the token as to be used for
+      * matching. NOTE(review): non-null only if
+      * {@link MorphoFeatures#getLemma()} never returns <code>null</code> -
+      * confirm against MorphoFeatures.
+      */
+     public String getTokenText(){
+         if(elc.isLemmaMatching() && morpho != null){
+             return morpho.getLemma();
+         } else {
+             return token.getSpan();
+         }
+     }
+ }
+ /**
+ * Represents a Chunk (group of tokens) used as context for EntityLinking.
+ * Typically a single {@link ChunkData#chunk} is used, but in case of
+ * overlapping and {@link ChunkData#isProcessable processable} chunks
+ * multiple {@link Chunk}s might be merged to a single {@link ChunkData}
+ * instance. In such cases {@link ChunkData#chunk} represents the
+ * first and {@link ChunkData#merged} the last of the merged chunks.<p>
+ * {@link ChunkData#startToken} and {@link ChunkData#endToken} represent
+ * the covered [start,end) {@link Token} indices relative to the current
+ * sections (typically a {@link Sentence}). {@link ChunkData#getStartChar()}
+ * and {@link ChunkData#getEndChar()} are the absolute [start,end) character
+ * indices within the {@link AnalysedText#getSpan()}
+ */
+ class ChunkData {
+ protected final static boolean DEFAULT_PROCESSABLE_STATE = true;
+ /** if the Chunk is processable */
+ final boolean isProcessable;
+ /** the Chunk */
+ final Chunk chunk;
+ /**
+ * In case multiple overlapping and processable {@link Chunk}s the
+ * section selected by the chunks are merged. While {@link #chunk}
+ * holds the original chunk (the first) this variable holds the
+ * last merged one. Enclosed chunks (in case more than two are
+ * merged) are not available via this class, but can be retrieved
+ * by iterating over the {@link AnalysedText} content part.
+ */
+ Chunk merged;
+ /** the start token index relative to the current section (sentence) */
+ int startToken;
+ /** the end token index relative to the current section (sentence) */
+ int endToken;
+ /**
+ * The number of processable Tokens enclosed by this Chunk
+ */
+ int processableCount;
+ /**
+ * The number of matchable Tokens enclosed by this Chunk
+ */
+ int matchableCount;
+ /**
+ * constructs and initializes the meta data for the parsed {@link Chunk}
+ * @param chunk
+ */
+ ChunkData(Chunk chunk){
+ this.chunk = chunk;
+ Boolean process = null;
+ for (Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)) {
+ if (tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
+ || tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
+ if (phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()) {
+ process = true;
+ break;
+ } // else probability to low for inclusion
+ } else if (phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()) {
+ process = false;
+ break;
+ } // else probability to low for exclusion
+ }
+ isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process;
+ }
+ /**
+ * Getter for the start character position
+ * @return the start character position of the selected text span.
+ */
+ public int getStartChar(){
+ return chunk.getStart();
+ }
+ /**
+ * Getter for the end character position of the text selected by
+ * possible multiple {@link #merged} chunks.
+ * @return the end character position considering possible {@link #merged}
+ * chunks.
+ */
+ public int getEndChar(){
+ return merged == null ? chunk.getEnd() : merged.getEnd();
+ }
+ }
+
+}
\ No newline at end of file
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Suggestion.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,317 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/**
+ *
+ */
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.engines.entitylinking.impl.LabelMatch.DEFAULT_LABEL_TOKEN_COMPARATOR;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
+
+/**
+ * A suggestion of an entity in the {@link EntitySearcher} for a part of the
+ * text. This class does not include the actual position within the Text,
+ * because it is intended to be used in combination with {@link LinkedEntity}.<p>
+ * This class also manages redirected entities and a state if redirects where
+ * already processed for this suggestion.<p>
+ * In addition this class also defines a set of {@link Comparator}s that are
+ * used to sort suggestions base on how well the fit the text.
+ * @author Rupert Westenthaler
+ *
+ */
+public class Suggestion {
+
+ private List<LabelMatch> labelMatches = new ArrayList<LabelMatch>();
+ private boolean labelMatchesSorted = true;
+ private final Representation result;
+ private Representation redirectsTo;
+ private boolean redirectProcessed;
+ private double score;
+ /**
+ * used to allow overriding the MATCH of this suggestion
+ */
+ private MATCH match;
+
+ public static enum MATCH {
+ /**
+ * No match (to less tokens, wrong oder ...)
+ */
+ NONE,
+ /**
+ * Not all tokens but sufficient to suggest (with lower score)
+ */
+ PARTIAL,
+ /**
+ * All requested Tokens match, but it is no exact match e.g. because
+ * the label defines some additional tokens
+ */
+ FULL,
+ /**
+ * The label of the suggested Entity is exactly the requested string
+ */
+ EXACT,
+ }
+ protected Suggestion(Representation result){
+ if(result == null){
+ throw new IllegalArgumentException("The parsed Result MUST NOT be NULL!");
+ }
+ this.result = result;
+ //TODO Do no longer use the resultScore as the score. We need to provide an
+ //own algorithm to calculate scores!
+// this.resultScore = result.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class);
+ }
+ /**
+ * Adds an new LabelMatch to this suggestion
+ * @param labelMatch the labelMatch
+ */
+ public void addLabelMatch(LabelMatch labelMatch){
+ if(labelMatch == null || labelMatch.getMatch() == MATCH.NONE){
+ return; //ignore null an MATCH.NONE entries
+ }
+ labelMatches.add(labelMatch);
+ if(labelMatches.size() > 1){
+ labelMatchesSorted = false;
+ }
+ }
+
+ /**
+ * Getter for the best label in the given language
+ * @param suggestion the suggestion
+ * @param nameField the field used to search for labels
+ * @param language the language
+ * @return the best match or {@link Suggestion#getMatchedLabel()} if non is found
+ */
+ public Text getBestLabel(String nameField, String language){
+ Representation rep = getRepresentation();
+ //start with the matched label -> so if we do not find a better one
+ //we will use the matched!
+ Text matchedLabel = getMatchedLabel();
+ Text label = matchedLabel;
+ // 1. check if the returned Entity does has a label -> if not return null
+ // add labels (set only a single label. Use "en" if available!
+ Iterator<Text> labels = rep.getText(nameField);
+ boolean matchFound = false;
+ while (labels.hasNext() && !matchFound) {
+ Text actLabel = labels.next();
+ if(label == null){
+ label = actLabel;
+ }
+ //now we have already a label check the language
+ String actLang = actLabel.getLanguage();
+ //use startWith to match also en-GB and en-US ...
+ if (actLang != null && actLang.startsWith(language)) {
+ //prefer labels with the correct language
+ label = actLabel;
+ if(matchedLabel != null && matchedLabel.getText().equalsIgnoreCase(label.getText())){
+ //found label in that language that exactly matches the
+ //label used to match the text
+ matchFound = true;
+ }
+ }
+ }
+ return label;
+
+ }
+
+ /**
+ * Shorthand for {@link #getLabelMatch()}.getMatchedLabel()
+ * @return the label or <code>null</code> if {@link MATCH#NONE}
+ */
+ public Text getMatchedLabel() {
+ return getLabelMatch().getMatchedLabel();
+ }
+ protected void setMatch(MATCH matchType) {
+ this.match = matchType;
+ }
+ /**
+ * Getter for the {@link MATCH}. If not manually set
+ * this forwards to {@link #getLabelMatch()}.getMatch()
+ * @return the {@link MATCH} of this suggestion
+ */
+ public MATCH getMatch() {
+ return match != null ? match : getLabelMatch().getMatch();
+ }
+
+ public final Representation getResult(){
+ return result;
+ }
+ /**
+ * The {@link RdfResourceEnum#entityRank entity rank} of the {@link #getResult() result}.
+ * The entity rank is the relative importance of an entity within an
+ * Collection of Entities (ReferencedSite, Thesaurus, Taxonomy ...).<p>
+ * This method returns the rank of the entity returned by
+ * {@link #getRepresentation()}. Therefore if an redirect is active it will
+ * be the rank of the redirected entity and not of the suggested result.
+ * @return the rank of the entity or <code>null</code> if not available
+ */
+ public Float getEntityRank() {
+ return getRepresentation().getFirst(RdfResourceEnum.entityRank.getUri(), Float.class);
+ }
+ /**
+ * @param score the score to set
+ */
+ public void setScore(double score) {
+ this.score = score;
+ }
+ /**
+ * @return the score
+ */
+ public double getScore() {
+ return score;
+ }
+ /**
+ * Returns <code>true</code> if the result has a registered redirect
+ * @return <code>true</code> if a redirect is present. Otherwise <code>false</code>
+ */
+ public boolean isRedirect(){
+ return redirectsTo != null;
+ }
+ /**
+ * Setter for Entity the {@link #getResult() result} of this match redirects
+ * to. Also sets {@link #setRedirectProcessed(boolean)} to <code>true</code>
+ * @param redirect the redirected entity or <code>null</code> if no redirect
+ * is present
+ */
+ protected void setRedirect(Representation redirect){
+ this.redirectsTo = redirect;
+ setRedirectProcessed(true);
+ }
+ /**
+ * Setter for the state if the redirects for this resultMatch where already
+ * processed. Calling {@link #setRedirect(Representation)} will set this
+ * automatically to <code>true</code>
+ * @param state the state.
+ */
+ protected void setRedirectProcessed(boolean state){
+ this.redirectProcessed = state;
+ }
+ /**
+ * Getter for the state if the redirect was processed for this ResultMatch
+ * @return the state
+ */
+ protected boolean isRedirectedProcessed(){
+ return redirectProcessed;
+ }
+ /**
+ * Getter for the Entity the {@link #getResult()} of this Entity redirects
+ * to. Returns <code>null</code> if there is no redirect.
+ * @return the entity the {@link #getResult()} redirects to or <code>null</code>
+ * if there is no redirect
+ */
+ public Representation getRedirect(){
+ return redirectsTo;
+ }
+
+ /**
+ * getter for the Representation of this result. In case of
+ * <code>{@link #isRedirect()} == true</code> it returns the the
+ * {@link #getRedirect()} otherwise it returns the {@link #getResult()}.<p>
+ * To check explicitly for the result of the redirect one needs to use
+ * {@link #getRedirect()} and {@link #getRedirect()} instead.
+ * @return The representation for this match. might be directly the
+ * {@link #getResult() result} or if present the
+ * {@link #getRedirect() redirected} resource.
+ */
+ public final Representation getRepresentation(){
+ return redirectsTo == null ? result : redirectsTo;
+ }
+ /**
+ * Getter for the top ranked LabelMatch.
+ * @return the top ranked {@link LabelMatch} or {@link LabelMatch#NONE}
+ * if no match is present.
+ */
+ public final LabelMatch getLabelMatch(){
+ if(!labelMatchesSorted){
+ Collections.sort(labelMatches, LabelMatch.DEFAULT_LABEL_TOKEN_COMPARATOR);
+ }
+ return labelMatches.isEmpty() ? LabelMatch.NONE : labelMatches.get(0);
+ }
+ /**
+ * Getter for the sorted list with all {@link LabelMatch}s of this Suggestion
+ * @return the sorted LabelMatches. Guaranteed NOT <code>null</code> and
+ * NOT empty. In case no match is present a singleton list containing
+ * {@link LabelMatch#NONE} is returned.
+ */
+ public final List<LabelMatch> getLabelMatches(){
+ if(!labelMatchesSorted){
+ Collections.sort(labelMatches, LabelMatch.DEFAULT_LABEL_TOKEN_COMPARATOR);
+ }
+ if(labelMatches.isEmpty()){
+ return Collections.singletonList(LabelMatch.NONE);
+ } else {
+ return labelMatches;
+ }
+ }
+ @Override
+ public String toString() {
+ return labelMatches.isEmpty() ? "no match" :labelMatches.get(0)
+ + " for "+result.getId()
+ +(redirectsTo != null ? " redirected to "+redirectsTo.getId() : "");
+ }
+
+ /**
+ * Compares {@link Suggestion}s based on the {@link Suggestion#getScore()}.
+ * In case the scores are equals the call is forwarded to the
+ * {@link Suggestion#DEFAULT_LABEL_TOKEN_COMPARATOR}.<p>
+ * This is NOT the default {@link Comparator} because score values are
+ * usually only calculated relative to the best matching suggestions and
+ * therefore only available later.
+ */
+ public static final Comparator<Suggestion> SCORE_COMPARATOR = new Comparator<Suggestion>() {
+ @Override
+ public int compare(Suggestion arg0, Suggestion arg1) {
+ return arg0.getScore() > arg1.getScore() ? -1 : //bigger score first
+ arg0.getScore() < arg1.getScore() ? 1 :
+ DEFAULT_LABEL_TOKEN_COMPARATOR.compare(arg0.getLabelMatch(), arg1.getLabelMatch());
+ }
+ };
+ /**
+ * Compares {@link Suggestion} first based on the {@link Suggestion#getMatch()} value
+ * and secondly based on the {@link RdfResourceEnum#entityRank}.
+ */
+ public static final Comparator<Suggestion> MATCH_TYPE_SUGGESTION_COMPARATOR = new Comparator<Suggestion>() {
+ @Override
+ public int compare(Suggestion arg0, Suggestion arg1) {
+ int labelMatch = DEFAULT_LABEL_TOKEN_COMPARATOR.compare(arg0.getLabelMatch(), arg1.getLabelMatch());
+ if(labelMatch == 0){
+ Float arg0Rank = arg0.getEntityRank();
+ if(arg0Rank == null){
+ arg0Rank = Float.valueOf(0);
+ }
+ Float arg1Rank = arg1.getEntityRank();
+ if(arg1Rank == null){
+ arg1Rank = Float.valueOf(0);
+ }
+ return arg1Rank.compareTo(arg0Rank); //higher ranks first
+ } else {
+ return labelMatch;
+ }
+ }
+ };
+
+
+}
\ No newline at end of file
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Utils.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Utils.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Utils.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Utils.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,19 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+/**
+ * Static utility methods. This class can not be instantiated.
+ */
+public final class Utils {
+
+    private Utils(){}
+
+    /**
+     * Checks if the parsed label contains at least a single letter or digit
+     * (as defined by {@link Character#isLetterOrDigit(int)}).
+     * @param label the label to check. Might be <code>null</code>
+     * @return <code>true</code> if at least one letter or digit is present.
+     * <code>false</code> if there is none or the label is <code>null</code>
+     * or empty.
+     */
+    public static boolean hasAlphaNumericChar(String label){
+        if (label != null) {
+            final int length = label.length();
+            int pos = 0;
+            while (pos < length) {
+                final int codePoint = label.codePointAt(pos);
+                if (Character.isLetterOrDigit(codePoint)) {
+                    return true;
+                }
+                //step over both chars of a surrogate pair at once
+                pos += Character.charCount(codePoint);
+            }
+        }
+        return false;
+    }
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/OpenNlpLabelTokenizer.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/OpenNlpLabelTokenizer.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/OpenNlpLabelTokenizer.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/OpenNlpLabelTokenizer.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,89 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer;
+
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link LabelTokenizer} implementation based on OpenNLP. It allows to
+ * configure custom Tokenizer models for specific languages by using the
+ * {@link #PARAM_MODEL} parameter of the language configuration.<p>
+ * <b>NOTE:</b> This component requires the optional dependency
+ * to <code>o.a.stanbol.commons.opennlp</code> as it depends
+ * on the {@link OpenNLP} service to retrieve {@link Tokenizer}s
+ * and to load {@link TokenizerModel}s.<p>
+ * This component registers itself with a service ranking of <code>-100</code>
+ * @author Rupert Westenthaler
+ *
+ */
+@Component(immediate=true)
+@Service
+@Properties(value={
+    @Property(name=Constants.SERVICE_RANKING,intValue=-100),
+    @Property(name=LabelTokenizer.SUPPORTED_LANUAGES,value="*")})
+public class OpenNlpLabelTokenizer implements LabelTokenizer {
+
+    private final Logger log = LoggerFactory.getLogger(OpenNlpLabelTokenizer.class);
+
+    /**
+     * Name of the language parameter used to configure the name of a custom
+     * tokenizer model
+     */
+    public static final String PARAM_MODEL = "model";
+
+    /**
+     * The language configuration. Initialised with the wildcard
+     * (all languages) as default
+     */
+    private LanguageConfiguration languageConfig = new LanguageConfiguration(
+        LabelTokenizer.SUPPORTED_LANUAGES, new String[]{"*"});
+
+    @Reference
+    protected OpenNLP openNlp;
+
+    /**
+     * Default constructor (the {@link OpenNLP} service is expected to be
+     * injected via the {@link Reference})
+     */
+    public OpenNlpLabelTokenizer(){}
+
+    /**
+     * Constructor that allows to directly parse the {@link OpenNLP} service
+     * @param openNLP the OpenNLP service
+     */
+    public OpenNlpLabelTokenizer(OpenNLP openNLP){
+        this.openNlp = openNLP;
+    }
+
+    /**
+     * Applies the properties of the parsed context to the language
+     * configuration
+     * @param ctx the component context
+     * @throws ConfigurationException declared for invalid configurations
+     */
+    @Activate
+    protected void activate(ComponentContext ctx) throws ConfigurationException {
+        languageConfig.setConfiguration(ctx.getProperties());
+    }
+    /**
+     * Resets the language configuration to its default
+     * @param ctx the component context
+     */
+    @Deactivate
+    protected void deactivate(ComponentContext ctx){
+        languageConfig.setDefault();
+    }
+
+    /**
+     * Tokenizes the parsed label. If a custom model is configured for the
+     * language this model is used. If no model is configured or the
+     * tokenizing with the custom model fails the tokenizer provided by the
+     * {@link OpenNLP} service for the language is used.
+     * @param label the label to tokenize
+     * @param language the language of the label
+     * @return the tokens or <code>null</code> if the language is not
+     * configured for this tokenizer
+     */
+    @Override
+    public String[] tokenize(String label, String language) {
+        if(!languageConfig.isLanguage(language)){
+            return null; //language not configured
+        }
+        final String customModel = languageConfig.getParameter(language, PARAM_MODEL);
+        if(customModel != null){
+            try {
+                TokenizerModel tokenizerModel = openNlp.getModel(TokenizerModel.class, customModel, null);
+                Tokenizer customTokenizer = new TokenizerME(tokenizerModel);
+                return customTokenizer.tokenize(label);
+            } catch (Exception e) {
+                //do not fail -> the default tokenizer is used instead
+                log.warn("Unable to load configured TokenizerModel '"+customModel
+                    + "' for language '"+language
+                    + "! Fallback to default Tokenizers",e);
+            }
+        }
+        //fallback to the defaults
+        Tokenizer defaultTokenizer = openNlp.getTokenizer(language);
+        return defaultTokenizer.tokenize(label);
+    }
+
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/SimpleLabelTokenizer.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/SimpleLabelTokenizer.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/SimpleLabelTokenizer.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/labeltokenizer/SimpleLabelTokenizer.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,68 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer;
+
+import java.util.ArrayList;
+
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.osgi.framework.Constants;
+/**
+ * Simple Tokenizer modelled after the
+ * OpenNLP <code>opennlp.tools.tokenize.SimpleTokenizer</code>. Tokens are
+ * separated by whitespace, by a change of the character type (letter,
+ * number, other) and between two different consecutive characters of the
+ * type "other".<p>
+ * The label is processed code point by code point, so that characters
+ * represented by a surrogate pair are never split into two tokens.
+ * @author Rupert Westenthaler
+ *
+ */
+@Component(immediate=true)
+@Service
+@Properties(value={
+    @Property(name=Constants.SERVICE_RANKING,intValue=-1000),
+    @Property(name=LabelTokenizer.SUPPORTED_LANUAGES,value="*")
+})
+public class SimpleLabelTokenizer implements LabelTokenizer {
+
+    /** The character types distinguished by this tokenizer */
+    private enum CT {WHITESPACE,LETTER,NUMBER,OTHER}
+
+    /**
+     * Tokenizes the parsed label. The language is not used by this
+     * implementation.
+     * @param label the label to tokenize. MUST NOT be <code>null</code>
+     * @param language the language (ignored)
+     * @return the tokens in the order of their occurrence. An empty array if
+     * the label is empty or contains only whitespace.
+     */
+    @Override
+    public String[] tokenize(String label, String language) {
+        ArrayList<String> tokens = new ArrayList<String>();
+        final int length = label.length();
+        int start = -1; //start index of the current token
+        int pc = 0; //the previous code point
+        CT state = CT.WHITESPACE; //type of the previous code point
+        CT charType = CT.WHITESPACE; //type of the current code point
+        int i = 0;
+        while (i < length) {
+            int c = label.codePointAt(i);
+            charType = getType(c);
+            if (state == CT.WHITESPACE) {
+                if (charType != CT.WHITESPACE) {
+                    start = i; //a new token starts after whitespace
+                }
+            } else if (charType != state || (charType == CT.OTHER && c != pc)) {
+                //type changed or two different "other" chars -> token ends
+                tokens.add(label.substring(start, i));
+                start = i;
+            }
+            state = charType;
+            pc = c;
+            //advance by the number of chars used by the code point. Moving
+            //by a single char would classify the low surrogate of a
+            //supplementary character on its own and split the pair.
+            i += Character.charCount(c);
+        }
+        if (charType != CT.WHITESPACE) {
+            //add the token that reaches to the end of the label
+            tokens.add(label.substring(start, length));
+        }
+        return tokens.toArray(new String[tokens.size()]);
+    }
+
+    /**
+     * Determines the character type of the parsed code point
+     * @param c the code point
+     * @return the type
+     */
+    private CT getType(int c){
+        if(Character.isLetter(c)){
+            return CT.LETTER;
+        } else if(Character.isDigit(c)){
+            return CT.NUMBER;
+        } else if(Character.isWhitespace(c) ||
+                Character.getType(c) == Character.SPACE_SEPARATOR){
+            return CT.WHITESPACE;
+        } else {
+            return CT.OTHER;
+        }
+    }
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/resources/OSGI-INF/metatype/metatype.properties Sat Nov 24 09:40:08 2012
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.SimpleLabelTokenizer.name=Stanbol Enhancer \
+EntityLinking SimpleLabelTokenizer
+org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.SimpleLabelTokenizer.description=This \
+is the default LabelTokenizer implementation. It behaves like the OpenNLP SimpleTokenizer but does not \
+have any external dependencies.
+
+org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.OpenNlpLabelTokenizer.name=Stanbol Enhancer \
+EntityLinking LabelTokenizer using OpenNLP
+org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.OpenNlpLabelTokenizer.description=This \
+LabelTokenizer implementation uses the OpenNLP Tokenizers API for tokenizing Entity labels processed \
+by the EntityLinkingEngine. It can be configured to load custom tokenizer models for specific \
+languages.
+
+service.ranking.name=Ranking
+service.ranking.description=If two LabelTokenizers support the same language, then the one with the \
+higher ranking is called first. Lower ranked LabelTokenizers are only used if the others return NULL \
+on tokenize requests.
+
+enhancer.engines.entitylinking.labeltokenizer.languages.name=Supported Languages
+enhancer.engines.entitylinking.labeltokenizer.languages.description=The supported languages \
+of this Tokenizer. Values can use '!{lang}' to exclude languages; '{lang}' to explicitly \
+include languages and '*' as wildcard. In addition parameters are supported by using the \
+'{lang};{param}=value' syntax. Example: "!zh,de;model=my-de-tokenizer-model.zip,*". This will disable \
+processing of Chinese, use the model with the name "my-de-tokenizer-model.zip" for German and the defaults \
+for all other languages.
\ No newline at end of file