You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/08/23 11:31:49 UTC
svn commit: r1516784 [3/3] - in
/stanbol/trunk/enhancement-engines/lucenefstlinking: ./ src/ src/main/
src/main/java/ src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/stanbol/
src/main/java/org/apache/stanbol/enhancer/ src/main/jav...
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,232 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.opensextant.solrtexttagger.TaggingAttribute;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Classifies Tokens in the Solr {@link TokenStream} with the {@link TaggingAttribute}
+ * based on NLP processing results present in the {@link AnalysedText}. This
+ * implementation classifies Tokens similar to the {@link EntityLinkingEngine}.
+ * It uses the {@link TextProcessingConfig} for its configuration.<p>
+ * <b> Implementation Details</b><p>
+ * While this code does not directly use {@link ProcessingState} it serves a
+ * similar purpose.<p>
+ * <ul>
+ * <li>This code needs to deal with potentially different tokenization present
+ * in the {@link AnalysedText} and the {@link TokenStream}. The implemented
+ * semantics does mark Tokens in the {@link TokenStream} as
+ * <code>{@link TaggingAttribute#isTaggable()} == true</code> if they do overlap
+ * with a {@link TokenData#isLinkable} token in the {@link AnalysedText}.
+ * <li> {@link TokenData#isMatchable} tokens are also considered as
+ * <code>{@link TaggingAttribute#isTaggable()} == true</code> if a
+ * {@link TokenData#isMatchable} token is following within two tokens of the
+ * {@link AnalysedText}. This range is extended if other matchable tokens are
+ * within the lookahead range. However the range is never extended over a
+ * section border.
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+public class LinkableTokenFilterStream extends TokenFilter {
+
+    private final Logger log = LoggerFactory.getLogger(LinkableTokenFilterStream.class);
+
+    /**
+     * The {@link SpanTypeEnum span types} processed when analysing a section.
+     * Required to use {@link SectionData}.
+     */
+    private static final Set<SpanTypeEnum> PROCESSED_SPAN_TYPES = EnumSet.of(
+        SpanTypeEnum.Chunk,SpanTypeEnum.Token);
+    /**
+     * The NLP processing results
+     */
+    private AnalysedText at;
+    /**
+     * The language of the text
+     */
+    //private String lang;
+    /**
+     * If the language is unicase (has no upper/lower case distinction) or not
+     */
+    private boolean isUnicaseLanguage;
+    /**
+     * Defines how NLP processing results are processed to determine Words that
+     * need to be looked-up in the vocabulary
+     */
+    private LanguageProcessingConfig lpc;
+
+    /**
+     * Iterator over all sections of the {@link AnalysedText}. Initialised by
+     * {@link #reset()}: sentences if present, otherwise the whole text as a
+     * single section.
+     */
+    private Iterator<? extends Section> sections;
+    /**
+     * The current section (<code>null</code> if the end was reached)
+     */
+    private SectionData sectionData;
+    /**
+     * Iterator over all {@link Token}s in the current section
+     */
+    private Iterator<TokenData> tokenIt;
+    /**
+     * The current Token of the {@link AnalysedText} (the NLP tokenization,
+     * which may differ from the Solr tokenization of the wrapped stream)
+     */
+    private TokenData token;
+
+    // session statistics: how many stream tokens were marked taggable vs. seen
+    private int lookupCount = 0;
+    private int incrementCount = 0;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
+    private final TaggingAttribute taggable = addAttribute(TaggingAttribute.class);
+
+    /**
+     * Creates a filter that sets the {@link TaggingAttribute} of the parsed
+     * {@link TokenStream} based on the NLP results of the {@link AnalysedText}.
+     * @param input the Solr token stream to filter
+     * @param at the NLP processing results for the same text
+     * @param lang the language of the text (used for the unicase check)
+     * @param lpc the language specific text processing configuration
+     */
+    protected LinkableTokenFilterStream(TokenStream input, AnalysedText at,
+            String lang, LanguageProcessingConfig lpc) {
+        super(input);
+        this.at = at;
+        //this.lang = lang;
+        this.lpc = lpc;
+        this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
+                UNICASE_SCRIPT_LANUAGES.contains(lang);
+    }
+
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        Iterator<Sentence> sentences = at.getSentences();
+        //if no sentence annotations are present process the whole text as a
+        //single section
+        this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator();
+        sectionData = null;
+        tokenIt = null;
+        incrementCount = 0;
+        lookupCount = 0;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+        if(input.incrementToken()){
+            incrementCount++;
+            boolean first = true;
+            TokenData token;
+            boolean lookup = false;
+            int lastMatchable = -1;
+            int lastIndex = -1;
+            //(1) inspect all NLP tokens overlapping the current stream token
+            while((token = nextToken(first)) != null){
+                first = false;
+                if(token.isLinkable){
+                    lookup = true;
+                } else if (token.isMatchable){
+                    lastMatchable = token.index;
+                    lastIndex = lastMatchable;
+                } //else if(token.hasAlphaNumeric){
+                //    lastIndex = token.index;
+                //}
+            }
+            //(2) lookahead: a matchable token is also marked taggable if a
+            //linkable token follows within the lookahead range. The range is
+            //extended for every further matchable token encountered, but never
+            //beyond the current section (sectionData).
+            if(!lookup && lastIndex >= 0 && sectionData != null){
+                List<TokenData> tokens = sectionData.getTokens();
+                int maxLookahead = Math.max(lastIndex, lastMatchable+3);
+                for(int i = lastIndex+1;!lookup && i < maxLookahead && i < tokens.size(); i++){
+                    token = tokens.get(i);
+                    if(token.isLinkable){
+                        lookup = true;
+                    } else if(token.isMatchable && (i+1) == maxLookahead){
+                        maxLookahead++; //increase lookahead for matchable tokens
+                    }
+                }
+            }
+            this.taggable.setTaggable(lookup);
+            if(lookup){
+                if(log.isTraceEnabled()){
+                    log.trace("Solr Token: [{},{}]: {}", new Object[]{
+                            offset.startOffset(), offset.endOffset(), termAtt});
+                }
+                lookupCount++;
+            }
+            return true;
+        } else {
+            //NOTE: may log NaN if the stream was empty (incrementCount == 0)
+            log.debug("lookup percentage: {}",lookupCount*100/(float)incrementCount);
+            return false;
+        }
+    }
+
+    /**
+     * Iterating over TokensData requires to iterate over two hierarchy levels:
+     * (1) sections (likely Sentences) and (2) Tokens <p>
+     * <b>NOTE</b> that this method modifies a lot of fields to update the
+     * state of the iteration accordingly. If the {@link #token} field is
+     * <code>null</code> after a call to this method this indicates that the
+     * end of the {@link Token} in the {@link AnalysedText} was reached.
+     * @param first is this the first call for the current {@link #offset} state?
+     * @return the token or <code>null</code> if there are no more tokens for
+     * the current {@link #offset}
+     */
+    private TokenData nextToken(boolean first){
+        final boolean isToken;
+        if(token == null || //on the first call
+                !first || //not the first call within one #incrementToken()
+                //current Token is before the current offset
+                token.token.getEnd() <= offset.startOffset()){
+            if(incrementTokenData()){ //get the next token
+                //the next token still overlaps with the current offset
+                isToken = token.token.getStart() < offset.endOffset();
+            } else { //end of stream
+                isToken = false;
+            }
+        } else { //check the current #token
+            isToken = token.token.getStart() < offset.endOffset();
+        }
+        return isToken ? token : null;
+    }
+    /**
+     * Increments the {@link #token} and - if necessary also the {@link #sectionData
+     * section}.
+     * @return <code>true</code> unless there are no more tokens
+     */
+    private boolean incrementTokenData(){
+        if(tokenIt == null || !tokenIt.hasNext()){
+            sectionData = null;
+            tokenIt = null;
+            //skip sections without tokens until one with tokens is found
+            while(sections.hasNext() && (tokenIt == null || !tokenIt.hasNext())){
+                //analyse NLP results for the next Section
+                sectionData = new SectionData(lpc, sections.next(),
+                    PROCESSED_SPAN_TYPES, isUnicaseLanguage);
+                tokenIt = sectionData.getTokens().iterator();
+            }
+            if(tokenIt != null && tokenIt.hasNext()){
+                token = tokenIt.next(); //first token of the next section
+                return true;
+            } else { //reached the end .. clean up
+                sectionData = null;
+                tokenIt = null;
+                return false;
+            }
+        } else { //more token in the same section
+            token = tokenIt.next();
+            return true;
+        }
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,189 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Represents an Entity that matches somewhere in the tagged text.
+ * <p>
+ * Matches are generated for {@link #id Lucene Document IDs} and
+ * {@link #getUri() Solr Document ids} (the URI of the matching entity). On the
+ * first access to the {@link #getLabels() labels}, {@link #getTypes() types}
+ * or {@link #getRedirects()} all those information are lazily retrieved by
+ * accessing the data stored in the index. The {@link FieldLoader} instance
+ * parsed in the constructor is used to load those information.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class Match {
+
+    private static final Logger log = LoggerFactory.getLogger(Match.class);
+
+    /**
+     * Lucene document id
+     */
+    public final int id;
+
+    /** Used to lazily load the {@link #values field values} from the index. */
+    private FieldLoader fieldLoader;
+
+    /** Lazily loaded field values (<code>null</code> until first access). */
+    private Map<FieldType,Object> values;
+    /** Set on the first failed load attempt; further loads are skipped. */
+    private boolean error = false;
+
+    /** The label that matched within the tagged text. */
+    private Literal matchLabel;
+    /**
+     * The score of the Match
+     */
+    private double score;
+
+    Match(int id, FieldLoader fieldLoader){
+        this.id = id;
+        this.fieldLoader = fieldLoader;
+    }
+
+    /**
+     * The URI of the matching entity or <code>null</code> if the data could
+     * not be loaded from the index.
+     */
+    public String getUri() {
+        return getValue(FieldType.id);
+    }
+
+    /**
+     * The labels of the matching entity (empty on load errors)
+     */
+    public Collection<Literal> getLabels(){
+        return getValues(FieldType.label);
+    }
+
+    /**
+     * The types of the matching entity (empty on load errors)
+     */
+    public Collection<UriRef> getTypes(){
+        return getValues(FieldType.type);
+    }
+
+    /**
+     * The redirects of the matching entity (empty on load errors)
+     */
+    public Collection<UriRef> getRedirects(){
+        return getValues(FieldType.redirect);
+    }
+    /**
+     * The entity ranking or <code>null</code> if not present
+     */
+    public Double getRanking(){
+        return getValue(FieldType.ranking);
+    }
+    /**
+     * Typed access to multi valued fields.
+     * @throws IllegalArgumentException if the parsed type is single valued
+     */
+    private <T> Collection<T> getValues(FieldType type){
+        if(!type.isMultivalued()){
+            throw new IllegalArgumentException("The parsed field Type '" + type
+                + "' is not multi valued!");
+        }
+        //use the typed Collections.emptySet() instead of the raw EMPTY_SET
+        Collection<T> value = getValue(type);
+        return value == null ? Collections.<T>emptySet() : value;
+    }
+    /**
+     * Typed access to field values. Lazily loads the values via the
+     * {@link FieldLoader} on the first call.
+     * @return the value or <code>null</code> if not present or loading failed
+     */
+    @SuppressWarnings("unchecked")
+    private <T> T getValue(FieldType type){
+        if(error){
+            return null;
+        } else if(values == null){
+            try {
+                values = fieldLoader.load(id);
+            } catch (IOException e) {
+                log.warn("Unable to load Entity for Lucene DocId '"+id+"'!",e);
+                error = true;
+                return null;
+            } catch (RuntimeException e) {
+                log.warn("Error while loading Entity for Lucene DocId '"+id+"'!",e);
+                error = true;
+                return null;
+            }
+        }
+        return (T) values.get(type);
+    }
+
+    /**
+     * Sets the score and the matching label of this Match
+     * @param score the score
+     * @param matchLabel the label that matched within the text
+     */
+    public void setMatch(double score, Literal matchLabel){
+        this.score = score;
+        this.matchLabel = matchLabel;
+    }
+    /**
+     * Allows to update the {@link #getScore() score} without changing the
+     * {@link #getMatchLabel() match}.
+     * @param score the new score
+     */
+    public void updateScore(double score) {
+        this.score = score;
+    }
+    public double getScore() {
+        return score;
+    }
+
+    public Literal getMatchLabel() {
+        return matchLabel;
+    }
+
+    @Override
+    public int hashCode() {
+        return id;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof Match && id == ((Match)o).id;
+    }
+
+    @Override
+    public String toString() {
+        String uri = getUri();
+        return uri != null ? uri : "Match[id: "+id+"|(uri unknown)]";
+    }
+
+    /**
+     * The fields loaded for a Match. Nested enums are implicitly static so the
+     * redundant modifier was removed.
+     */
+    enum FieldType {
+        id(String.class),
+        label(Literal.class, true),
+        type(UriRef.class,true),
+        redirect(UriRef.class,true),
+        ranking(Double.class);
+
+        final Class<?> valueType;
+        final boolean multivalued;
+
+        FieldType(Class<?> type){
+            this(type,false);
+        }
+        FieldType(Class<?> type, boolean multivalued){
+            this.valueType = type;
+            this.multivalued = multivalued;
+        }
+        public Class<?> getValueType() {
+            return valueType;
+        }
+        public boolean isMultivalued() {
+            return multivalued;
+        }
+    }
+
+    /**
+     * Loads the {@link FieldType field values} for a Lucene document id.
+     * Nested interfaces are implicitly static.
+     */
+    interface FieldLoader {
+        Map<FieldType,Object> load(int id) throws IOException;
+    }
+    /**
+     * Compares {@link Match} instances based on the {@link Match#getScore()}
+     */
+    public static final Comparator<Match> SCORE_COMPARATOR = new Comparator<Match>() {
+
+        @Override
+        public int compare(Match a, Match b) {
+            return Double.compare(b.score,a.score); //higher first
+        }
+
+    };
+    /**
+     * Compares {@link Match} instances based on the {@link Match#getRanking()}.
+     * <code>null</code> values are assumed to be the smallest.
+     */
+    public static final Comparator<Match> ENTITY_RANK_COMPARATOR = new Comparator<Match>(){
+        @Override
+        public int compare(Match arg0, Match arg1) {
+            Double r1 = arg0.getRanking();
+            Double r2 = arg1.getRanking();
+            //higher rankings first; null is smaller than any ranking
+            return r2 == null ? r1 == null ? 0 : -1 : r1 == null ? 1 : r2.compareTo(r1);
+        }
+    };
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,125 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+
+/**
+ * Minimal helper class to represent a Tag.<p>
+ * TODO: This will need to collect additional values from the suggested
+ * SolrDocuments:<ul>
+ * <li> the type information - {@link EntityLinkerConfig#TYPE_FIELD} values
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+class Tag {
+
+    /**
+     * The [start,end] character offsets of this tag within the
+     * {@link AnalysedText}
+     */
+    final int[] span;
+    /**
+     * Matching documents
+     */
+    private Set<Match> ids;
+
+    /** Suggestions (expected to be sorted with the best suggestion first) */
+    private List<Match> suggestions;
+    /** The fise:selected-text value */
+    private String anchor;
+
+    Tag(int start, int end) {
+        span = new int[]{start,end};
+    }
+    Tag(int[] span) {
+        this.span = span;
+    }
+
+    /**
+     * Adds matching documents. The first parsed set is taken over as-is (and
+     * MAY be modified by later calls).
+     * @param ids the matches to add
+     */
+    public void addIds(Set<Match> ids){
+        if(this.ids == null){
+            this.ids = ids;
+        } else {
+            this.ids.addAll(ids);
+        }
+    }
+    /**
+     * The matching documents (empty if none)
+     */
+    public Set<Match> getMatches(){
+        //typed emptySet() avoids the unchecked raw EMPTY_SET constant
+        return ids == null ? Collections.<Match>emptySet() : ids;
+    }
+    public int getStart() {
+        return span[0];
+    }
+
+    public int getEnd() {
+        return span[1];
+    }
+    /**
+     * Setter for the Anchor text
+     * @param anchor
+     */
+    public void setAnchor(String anchor) {
+        this.anchor = anchor;
+    }
+    /**
+     * Getter for the Anchor text
+     * @return the fise:selected-text value
+     */
+    public String getAnchor() {
+        return anchor;
+    }
+
+    public void setSuggestions(List<Match> suggestions) {
+        this.suggestions = suggestions;
+    }
+
+    public List<Match> getSuggestions() {
+        return suggestions;
+    }
+
+    @Override
+    public int hashCode() {
+        return Arrays.hashCode(span);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof Tag && Arrays.equals(span, ((Tag)o).span);
+    }
+
+    @Override
+    public String toString() {
+        return new StringBuilder("Tag").append(Arrays.toString(span)).toString();
+    }
+
+    /**
+     * Sorts spans by ascending start offset; on equal starts the longer span
+     * (higher end offset) comes first.
+     */
+    static final Comparator<int[]> SPAN_COMPARATOR = new Comparator<int[]>() {
+
+        @Override
+        public int compare(int[] a, int[] b) {
+            //Integer.compare avoids the manual (and overflow prone) ternaries
+            int c = Integer.compare(a[0], b[0]);
+            if(c == 0){ //same start: longer spans first
+                c = Integer.compare(b[1], a[1]);
+            }
+            return c;
+        }
+
+    };
+
+    /**
+     * Returns the score of the best {@link #getSuggestions() suggestion}
+     * @return the score of the first suggestion or <code>0</code> if none
+     */
+    public double getScore() {
+        return suggestions == null || suggestions.isEmpty() ? 0 :
+            suggestions.get(0).getScore();
+    }
+
+}
\ No newline at end of file
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,510 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queries.function.valuesource.IfFunction;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldLoader;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldType;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCache;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.impl.ValueSourceAccessor;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.opensextant.solrtexttagger.TaggerFstCorpus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Profile created based on the {@link IndexConfiguration} for processing a
+ * parsed ContentItem. <p>
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class TaggingSession implements Closeable {
+
+    private final Logger log = LoggerFactory.getLogger(TaggingSession.class);
+
+    /** The language of the processed document */
+    private String language;
+
+    /** The language specific FST corpus (<code>null</code> if not configured) */
+    private Corpus langCorpus;
+
+    /** The default FST corpus (<code>null</code> if not configured or same as langCorpus) */
+    private Corpus defaultCorpus;
+
+    /**
+     * The Solr document id field holding the URI of the Entity.
+     */
+    protected final String idField;
+
+    /**
+     * The Solr field holding the labels in the language of the current Document
+     */
+    protected final String labelField;
+
+    protected final Language labelLang;
+    /**
+     * The Solr field holding the labels in the default matching language or
+     * <code>null</code> if the same as {@link #labelField}
+     */
+    protected final String defaultLabelField;
+
+    protected final Language defaultLabelLang;
+
+    /** All Solr fields that need to be loaded for matching documents */
+    protected final Set<String> solrDocfields = new HashSet<String>();
+
+    protected final IndexConfiguration config;
+
+    protected final String typeField;
+    protected final String redirectField;
+    protected final String rankingField;
+    private final RefCounted<SolrIndexSearcher> searcherRef;
+    /*
+     * Document Cache and session statistics for the cache
+     */
+    private RefCounted<EntityCache> documentCacheRef;
+    private int docLoaded = 0;
+    private int docCached = 0;
+    private int docAppended = 0;
+    private final ValueSourceAccessor uniqueKeyCache; //no longer used
+    private final FieldLoaderImpl fieldLoader;
+
+
+    TaggingSession(String language, IndexConfiguration config) throws CorpusException {
+        this.language = language;
+        this.config = config;
+        CorpusInfo langCorpusInfo = config.getCorpus(language);
+        CorpusInfo defaultCorpusInfo = config.getDefaultCorpus();
+
+        //obtain the Solr Document Id field
+        SchemaField idSchemaField = config.getIndex().getSchema().getUniqueKeyField();
+        idField = idSchemaField.getName();
+        solrDocfields.add(idField);
+
+        //obtain the language specific fields for the session
+        if(langCorpusInfo == null && defaultCorpusInfo == null){
+            //this should not happen, because the canEnhance method of the
+            //engine should already reject such calls
+            throw new IllegalStateException("No FST Corpus configured for language '"
+                +language+"' and also no default FST Corpus is present.!");
+        }
+        if(langCorpusInfo != null){
+            this.langCorpus = new Corpus(langCorpusInfo,
+                obtainFstCorpus(langCorpusInfo));
+            this.labelField = langCorpusInfo.storedField;
+            solrDocfields.add(labelField);
+            this.labelLang = langCorpusInfo.language == null ||
+                    StringUtils.isBlank(langCorpusInfo.language) ? null :
+                        new Language(langCorpusInfo.language);
+        } else {
+            this.labelField = null;
+            this.labelLang = null;
+        }
+        if(defaultCorpusInfo != null && !defaultCorpusInfo.equals(langCorpusInfo)){
+            this.defaultCorpus = new Corpus(defaultCorpusInfo,
+                obtainFstCorpus(defaultCorpusInfo));
+            this.defaultLabelField = defaultCorpusInfo.storedField;
+            solrDocfields.add(defaultLabelField);
+            this.defaultLabelLang = defaultCorpusInfo.language == null ||
+                    StringUtils.isBlank(defaultCorpusInfo.language) ? null :
+                        new Language(defaultCorpusInfo.language);
+        } else {
+            this.defaultCorpus = null;
+            this.defaultLabelField = null;
+            this.defaultLabelLang = null;
+        }
+        if(this.defaultCorpus == null && this.langCorpus == null){
+            //NOTE: the ternary expressions MUST BE parenthesised. Without the
+            //parentheses '+' binds stronger than '?:' so the condition was
+            //evaluated on the concatenated String (always != null) and
+            //'langCorpusInfo.indexedField' was dereferenced even though
+            //langCorpusInfo is null on this code path (causing a NPE instead
+            //of the intended CorpusException).
+            throw new CorpusException("Unable to initialise a FST corpus for language '"
+                + language + "'. Neither the language specific Corpus (field: "
+                + (langCorpusInfo != null ? langCorpusInfo.indexedField : "<undefined>")
+                + ") nor for the default language (field: "
+                + (defaultCorpusInfo != null ? defaultCorpusInfo.indexedField : "<undefined>")
+                + ") is currently available!", null);
+        }
+        if(config.getTypeField() != null){
+            this.typeField = config.getTypeField();
+            solrDocfields.add(typeField);
+        } else {
+            this.typeField = null;
+        }
+        if(config.getRedirectField() != null){
+            this.redirectField = config.getRedirectField();
+            solrDocfields.add(redirectField);
+        } else {
+            this.redirectField = null;
+        }
+        if(config.getRankingField() != null){
+            this.rankingField = config.getRankingField();
+            solrDocfields.add(rankingField);
+        } else {
+            this.rankingField = null;
+        }
+        searcherRef = config.getIndex().getSearcher();
+        SolrIndexSearcher searcher = searcherRef.get();
+        documentCacheRef = config.getEntityCacheManager().getCache(searcher);
+        uniqueKeyCache = null; //no longer used.
+        fieldLoader = new FieldLoaderImpl(searcher.getIndexReader());
+
+    }
+    /**
+     * Used to instantiate {@link Match}es
+     * @param docId the Lucene document Id as returned by the FST corpus
+     * @return the Match instance
+     */
+    public Match createMatch(int docId){
+        return new Match(docId,fieldLoader);
+    }
+
+    /**
+     * Releases the Solr index searcher and the document cache references held
+     * by this session.
+     */
+    public void close(){
+        searcherRef.decref(); //clean up the Solr index searcher reference
+        documentCacheRef.decref(); //clean up the DocumentCache reference
+    }
+    /**
+     * The language of this Session. This is typically the language detected for
+     * the document.
+     * @return the language of this Session
+     */
+    public String getLanguage() {
+        return language;
+    }
+
+    public Corpus getDefaultCorpus() {
+        return defaultCorpus;
+    }
+
+    public Corpus getLanguageCorpus() {
+        return langCorpus;
+    }
+
+    public SolrIndexSearcher getSearcher() {
+        return searcherRef.get();
+    }
+
+    /**
+     * Creates a new TaggingSession for the parsed language based on the
+     * parsed index configuration.
+     * @throws CorpusException if no FST corpus is available for the language
+     */
+    public static TaggingSession createSession(IndexConfiguration indexConfig,
+            String language) throws CorpusException {
+        TaggingSession session = new TaggingSession(language, indexConfig);
+        return session;
+    }
+
+    public EntityCache getDocumentCache(){
+        return documentCacheRef.get();
+    }
+    /**
+     * The number of Lucene Documents loaded from disk in this session so far
+     * @return the number of loaded documents
+     */
+    public int getSessionDocLoaded(){
+        return docLoaded;
+    }
+    /**
+     * The number of Lucene Documents retrieved from the {@link #getDocumentCache()}
+     * in this session so far
+     * @return the number of cached documents
+     */
+    public int getSessionDocCached(){
+        return docCached;
+    }
+    /**
+     * The number of Lucene Documents retrieved from the {@link #getDocumentCache()},
+     * but with missing fields from the Cache. For such documents the additional
+     * fields (typically labels of different languages) were read from disk and
+     * added to the cached document.
+     * @return the number of appended documents
+     */
+    public int getSessionDocAppended(){
+        return docAppended;
+    }
+
+
+    /**
+     * Obtains the FST corpus for the parsed CorpusInfo.
+     * @param fstInfo the info about the corpus
+     * @return the corpus - NOTE(review): this MAY return <code>null</code>
+     * (e.g. on a file error when re-creation is allowed and enqueued) - callers
+     * need to cope with <code>null</code>; confirm this is intended.
+     * @throws CorpusException if the corpus is not (yet) available
+     */
+    private TaggerFstCorpus obtainFstCorpus(CorpusInfo fstInfo) throws CorpusException {
+        TaggerFstCorpus fstCorpus;
+        synchronized (fstInfo) { // one at a time
+            fstCorpus = fstInfo.getCorpus();
+            if (fstCorpus == null) {
+                if (fstInfo.isEnqueued()) {
+                    throw new CorpusException("The FST corpus for language '"
+                        + fstInfo.language + "' is enqueued for creation, but not yet "
+                        + "available. Try at a later point in time", null);
+                }
+                if (fstInfo.isFstCreationError()) {
+                    throw new CorpusException(fstInfo.getErrorMessage(), null);
+                }
+                if (fstInfo.isFstFileError() && fstInfo.allowCreation) {
+                    //try to recreate the FST corpus
+                    if(config.getExecutorService() != null){
+                        // TODO: this code should get moved to a CorpusManager class
+                        config.getExecutorService().execute(
+                            new CorpusCreationTask(config.getIndex(), fstInfo));
+                        throw new CorpusException("The FST corpus for language '"
+                            + fstInfo.language + "' was invalid and is now "
+                            + "enqueued for re-creation. Retry at a later "
+                            + "point in time.", null);
+                    } else {
+                        throw new CorpusException(fstInfo.getErrorMessage(), null);
+                    }
+                }
+            }
+
+        }
+        return fstCorpus;
+    }
+    /**
+     * {@link FieldLoader} implementation used to create {@link Match} instances
+     */
+    private class FieldLoaderImpl implements FieldLoader {
+
+        private static final String LOADED_FIELDS_FIELD_NAME = "__loadedFields__";
+
+        /**
+         * Marker fields added to cached documents so that later sessions can
+         * detect which Solr fields were loaded for the cached document.
+         */
+        private List<Field> loadedFieldsFields;
+
+        private final IndexReader reader;
+        /**
+         * Cache similar to the {@link EntityCache}, but with a scope bound to
+         * life cycle of this FieldLoaderImpl instance (a single TaggingSession).
+         * This cache ensures the Lucene Documents are not loaded twice while
+         * processing the same document (even if no EntityCache is configured or
+         * the size of the EntityCache is too small).
+         */
+        private final Map<Integer,Document> sessionCache = new HashMap<Integer,Document>();
+        /**
+         * The EntityCache instance that caches entity data over multiple sessions
+         */
+        private final EntityCache cache;
+
+        public FieldLoaderImpl(IndexReader reader) {
+            this.reader = reader;
+            loadedFieldsFields = new ArrayList<Field>(solrDocfields.size());
+            for(String loadedFieldName : solrDocfields){
+                loadedFieldsFields.add(new StringField(LOADED_FIELDS_FIELD_NAME,
+                    loadedFieldName, Store.NO));
+            }
+            this.cache = documentCacheRef.get();
+        }
+
+        /**
+         * Loads (or retrieves from the session and entity caches) the Lucene
+         * Document for the parsed id and converts it to {@link FieldType}
+         * keyed values.
+         * @param id the Lucene document id
+         * @return the field values of the document
+         * @throws IOException if the document can not be read from the index
+         */
+        @Override
+        public Map<FieldType,Object> load(int id) throws IOException {
+            //load the Lucene Document for the id
+            Integer ID = Integer.valueOf(id);
+            Document doc = sessionCache.get(ID);
+            if(doc == null){
+                doc = cache.get(ID);
+                if(doc == null){
+                    doc = reader.document(id, solrDocfields);
+                    //if we read a doc from the index we need to add information about
+                    //the fields we loaded (especially the languages of labels loaded).
+                    //NOTE that those information will never be stored in the index. They
+                    //are only kept in-memory when caching this document.
+                    for(Field loadedFieldsField : loadedFieldsFields){
+                        doc.add(loadedFieldsField);
+                    }
+                    docLoaded++;
+                    cache.cache(ID, doc);
+                } else {
+                    //we need to check if the fields of the cached doc are sufficient
+                    //for the requested Solr Document fields
+                    Set<String> fields = new HashSet<String>(solrDocfields);
+                    String[] loaded = doc.getValues(LOADED_FIELDS_FIELD_NAME);
+                    for(int i=0;i < loaded.length && !fields.isEmpty(); i++){
+                        fields.remove(loaded[i]);
+                    }
+                    if(!fields.isEmpty()){ //we are missing some fields
+                        //need to load it from the index
+                        Document tmp = reader.document(id, fields);
+                        //add the additional fields to the cached doc
+                        for(IndexableField field : tmp.getFields()){
+                            doc.add(field);
+                        }
+                        //also update the loaded fields
+                        for(String loadedField : fields){
+                            doc.add(new StringField(LOADED_FIELDS_FIELD_NAME,
+                                loadedField, Store.NO));
+                        }
+                        //NOTE: no need to update the cache, as we have updated the
+                        //cached value.
+                        docAppended++;
+                    } else {
+                        docCached++;
+                    }
+                }
+                //add this doc to the session cache
+                sessionCache.put(ID, doc);
+            } //else { //document is in the session cache ... just use it
+            //NOTE: The session cache has a minor side effect on the
+            //      EntityCache. Because multiple occurrences of an Entity
+            //      within the Document are not requested on the EntityCache
+            //      LRU based implementations will get slightly different
+            //      statistics. Assuming that the maximum size of the EntityCache
+            //      is >> as the number of Documents matching for the current Text
+            //      this effect can be considered as negligible.
+            //}
+            if(doc != null){
+                Map<FieldType,Object> values =
+                        new EnumMap<Match.FieldType,Object>(FieldType.class);
+                //load the ID
+                values.put(FieldType.id, doc.get(idField));
+                //load the labels
+                Set<Literal> labels = new HashSet<Literal>();
+                if(labelField != null){ //no language specific corpus configured
+                    for(String label : doc.getValues(labelField)){
+                        labels.add(new PlainLiteralImpl(label, labelLang));
+                    }
+                }
+                if(defaultLabelField != null){
+                    for(String label : doc.getValues(defaultLabelField)){
+                        labels.add(new PlainLiteralImpl(label, defaultLabelLang));
+                    }
+                }
+                values.put(FieldType.label, labels);
+                //load the types
+                if(typeField != null){
+                    Set<UriRef> types = new HashSet<UriRef>();
+                    for(String type : doc.getValues(typeField)){
+                        types.add(new UriRef(type));
+                    }
+                    values.put(FieldType.type, types);
+                }
+                //load the redirects
+                if(redirectField != null){
+                    Set<UriRef> redirects = new HashSet<UriRef>();
+                    for(String redirect : doc.getValues(redirectField)){
+                        redirects.add(new UriRef(redirect));
+                    }
+                    values.put(FieldType.redirect, redirects);
+                }
+                //load the rankings
+                if(rankingField != null){
+                    //NOTE: documents might not have a ranking value - getField(..)
+                    //returns null in that case (would have caused a NPE before)
+                    IndexableField rankingIdxField = doc.getField(rankingField);
+                    Number num = rankingIdxField == null ? null :
+                            rankingIdxField.numericValue();
+                    Double ranking;
+                    if(num instanceof Double){
+                        ranking = (Double)num;
+                    } else if (num != null){
+                        ranking = Double.valueOf(num.doubleValue());
+                    } else { //num == null ... try the stored String value
+                        String value = doc.get(rankingField);
+                        if(value != null){
+                            try {
+                                ranking = Double.valueOf(value);
+                            } catch (NumberFormatException e) {
+                                ranking = null;
+                            }
+                        } else {
+                            ranking = null;
+                        }
+                    }
+                    if(ranking != null){
+                        values.put(FieldType.ranking, ranking);
+                    }
+                }
+                return values;
+            } else {
+                throw new IOException("No document found for Lucene doc id '"+id+"'!");
+            }
+        }
+    }
+
+    /**
+     * An FST corpus together with the {@link CorpusInfo} it was created from.
+     */
+    public class Corpus {
+
+        private CorpusInfo corpusInfo;
+        private TaggerFstCorpus fst;
+
+        Corpus(CorpusInfo corpusInfo, TaggerFstCorpus fst){
+            this.corpusInfo = corpusInfo;
+            this.fst = fst;
+        }
+
+        /**
+         * @deprecated use {@link #getLanguage()} instead (typo in the name).
+         * Kept for backward compatibility.
+         */
+        @Deprecated
+        public String getLanugage(){
+            return getLanguage();
+        }
+
+        public String getLanguage(){
+            return corpusInfo.language;
+        }
+
+        public Analyzer getAnalyzer(){
+            return corpusInfo.analyzer;
+        }
+
+        public TaggerFstCorpus getFst(){
+            return fst;
+        }
+
+        public String getIndexedField() {
+            return corpusInfo.indexedField;
+        }
+
+        public String getStoredField(){
+            return corpusInfo.storedField;
+        }
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,46 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.common.SolrDocument;
+
+/**
+ * A cache for Lucene {@link Document}s holding the Entity information required
+ * for entity linking. This cache is intended to avoid disc access when loading
+ * entity data of entities detected by the FST tagging in the parsed document.
+ * NOTE(review): all methods operate on Lucene {@link Document}s (the original
+ * javadoc mentioned {@link SolrDocument} — presumably a leftover; confirm).
+ * @author Rupert Westenthaler
+ *
+ */
+public interface EntityCache {
+
+ /**
+ * If the current version of the index does not equal this version
+ * the cache needs to be renewed.
+ * @return the version this cache is built upon
+ */
+ Object getVersion();
+ /**
+ * Getter for the Document based on the Lucene document ID
+ * @param docId the Lucene document ID (the unique key)
+ * @return the Document or <code>null</code> if not in the cache
+ */
+ Document get(Integer docId);
+
+ /**
+ * Caches the document for the parsed Lucene document id
+ * @param docId the Lucene document id
+ * @param doc the Document
+ */
+ void cache(Integer docId, Document doc);
+
+ /**
+ * The size of the cache or <code>-1</code> if not available
+ * @return the size or <code>-1</code> if not known
+ */
+ int size();
+
+ /**
+ * The statistics for this cache
+ * @return a human readable representation of the cache statistics
+ */
+ String printStatistics();
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,36 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
+
+/**
+ * Manages a cache for Entity data required for EntityLinking. Cached entity
+ * data are represented by Lucene {@link Document}s.
+ * <p>
+ * Implementations are expected to manage a single {@link EntityCache} for the
+ * current version of the Lucene index. A 'new' version is expected as soon as
+ * {@link #getCache(Object)} is called with a different version object. In that
+ * case the current {@link EntityCache} should be cleared and a new empty one
+ * needs to be created. The new cache might get autowarmed (if supported and
+ * configured).
+ * <p>
+ * Implementations need to wait with clearing/closing an outdated {@link EntityCache}
+ * instance until the old version is no longer used
+ * ({@link RefCounted#close()} is called).
+ */
+public interface EntityCacheManager {
+
+ /**
+ * Getter for a reference counting instance of the {@link EntityCache}.
+ * Callers need to ensure that {@link RefCounted#decref()} is called when they
+ * do no longer need the obtained EntityCache instance.
+ * @param version the version object. Typically the current
+ * {@link SolrIndexSearcher} instance can be used as version object as a new
+ * cache instance should be created if a new index searcher was opened by
+ * the SolrCore.
+ * @return A counting reference to the EntityCache
+ */
+ RefCounted<EntityCache> getCache(Object version);
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,108 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.document.Document;
+import org.apache.solr.search.CacheRegenerator;
+import org.apache.solr.search.FastLRUCache;
+import org.apache.solr.search.SolrCache;
+import org.apache.solr.util.RefCounted;
+
+/**
+ * Implementation of the {@link EntityCacheManager} based on the Solr
+ * {@link FastLRUCache} implementation.
+ * <p>
+ * {@link #getCache(Object)} is synchronized: it performs a check-then-act on
+ * the shared {@link #current} reference, so concurrent callers could otherwise
+ * create two caches for the same version or leak a replaced cache that was
+ * never marked as outdated.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class FastLRUCacheManager implements EntityCacheManager {
+
+    /** the RefCounted for the currently active cache (guarded by <code>this</code>) */
+    RefCounted<EntityCache> current;
+    /** regenerator used for autowarming new cache versions (<code>null</code> to deactivate) */
+    private final CacheRegenerator regenerator;
+    /** immutable configuration parsed to {@link SolrCache#init} */
+    private final Map<String,String> config;
+
+    /**
+     * Creates a cache manager instance with the parsed maximum size and no
+     * support for autowarming
+     * @param size the maximum size or values &lt;= 0 to use the default size
+     */
+    public FastLRUCacheManager(int size){
+        this(size,0,null);
+    }
+    /**
+     * Creates a cache manager instance with the parsed maximum size and support
+     * for autowarming.
+     * @param size the maximum size or values &lt;= 0 to use the default size
+     * @param autowarmCount the number of documents added to a new cache based
+     * on entries of the previous (outdated) version
+     * @param regenerator the regenerator instance used for autowarming. If
+     * <code>null</code> autowarming is deactivated.
+     */
+    public FastLRUCacheManager(int size, int autowarmCount, CacheRegenerator regenerator){
+        Map<String,String> config = new HashMap<String,String>();
+        config.put("name", "Tagging Document Cache");
+        if(size > 0){
+            config.put("size",Integer.toString(size));
+        }
+        if(regenerator != null){ //autowarming requires a regenerator
+            config.put("autowarmCount",Integer.toString(autowarmCount));
+        }
+        this.config = Collections.unmodifiableMap(config);
+        this.regenerator = regenerator;
+    }
+
+
+    @Override
+    public synchronized RefCounted<EntityCache> getCache(Object version) {
+        if(current == null || !current.get().getVersion().equals(version)){
+            if(current != null){
+                //mark the old cache as outdated so it gets cleared and closed
+                //as soon as all handed out references are released
+                ((RefCountedImpl)current).setOutdated();
+            }
+            //create a new cache for the new version
+            SolrCache<Integer,Document> cache = new FastLRUCache<Integer,Document>();
+            cache.init(config, null, regenerator);
+            current = new RefCountedImpl(new SolrEntityCache(version, cache));
+        }
+        current.incref(); //count the reference handed out to the caller
+        return current;
+    }
+
+    /**
+     * {@link RefCounted} implementation that ensures that outdated caches are
+     * cleared and closed as soon as they are no longer in use.
+     *
+     * @author Rupert Westenthaler
+     *
+     */
+    protected class RefCountedImpl extends RefCounted<EntityCache>{
+
+        public RefCountedImpl(SolrEntityCache resource) {
+            super(resource);
+        }
+
+        //volatile: setOutdated() and close() may be called by different threads
+        private volatile boolean outdated;
+
+        /**
+         * Used by the manager implementation to mark the RefCounted EntityCache
+         * as outdated
+         */
+        protected void setOutdated() {
+            outdated = true;
+        }
+
+        /**
+         * Clears and closes the wrapped cache if outdated. Called by
+         * {@link RefCounted} when the reference count drops to zero.
+         */
+        @Override
+        protected void close(){
+            if(outdated){
+                ((SolrEntityCache)get()).close();
+            }
+        }
+
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,61 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.search.FastLRUCache;
+import org.apache.solr.search.SolrCache;
+
+/**
+ * Implementation of the {@link EntityCache} interface by using the
+ * {@link SolrCache} API.
+ * <p>
+ * After {@link #close()} was called all operations turn into no-ops:
+ * {@link #get(Integer)} returns <code>null</code>, {@link #cache(Integer, Document)}
+ * is ignored and {@link #size()} returns <code>-1</code> (as permitted by the
+ * {@link EntityCache} contract).
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class SolrEntityCache implements EntityCache {
+
+    private final SolrCache<Integer,Document> cache;
+    private final Object version;
+    //volatile: close() is typically triggered by a different thread than
+    //the readers/writers of this cache
+    private volatile boolean closed;
+
+    public SolrEntityCache(Object version, SolrCache<Integer,Document> cache) {
+        this.cache = cache;
+        this.version = version;
+    }
+
+    @Override
+    public Object getVersion() {
+        return version;
+    }
+
+    @Override
+    public Document get(Integer docId) {
+        return !closed ? cache.get(docId) : null;
+    }
+
+    @Override
+    public void cache(Integer docId, Document doc) {
+        if(!closed){
+            cache.put(docId, doc);
+        }
+    }
+
+    @Override
+    public int size() {
+        //-1 if no longer known because the cache was already closed
+        return !closed ? cache.size() : -1;
+    }
+    @Override
+    public String printStatistics() {
+        return !closed ? cache.getStatistics().toString() : "closed";
+    }
+
+    @Override
+    public String toString() {
+        return cache.getDescription();
+    }
+
+    /**
+     * Clears and closes the wrapped {@link SolrCache}. Idempotent: only the
+     * first call has an effect.
+     */
+    void close(){
+        if(!closed){
+            closed = true;
+            cache.clear();
+            cache.close();
+        }
+    }
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,50 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.impl;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.queries.function.FunctionValues;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.solr.common.SolrException;
+
+/**
+ * Resolves per-document values of a Lucene {@link ValueSource} over a
+ * (possibly multi-segment) index.
+ * See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}.
+ * <p>
+ * NOTE(review): instances hold mutable per-lookup state ({@code values},
+ * {@code localId}) — not safe for concurrent use without external
+ * synchronization; confirm callers are single threaded.
+ */
+public class ValueSourceAccessor {
+ // implement FunctionValues ? (open question from the original author)
+ /** one leaf (segment) reader context per index segment */
+ private final List<AtomicReaderContext> readerContexts;
+ /** lazily created FunctionValues, one slot per segment (parallel to readerContexts) */
+ private final FunctionValues[] docValuesArr;
+ /** the value source values are resolved for */
+ private final ValueSource valueSource;
+ /** shared context created by ValueSource.newContext (raw type in this Lucene version) */
+ private final Map fContext;
+
+ //mutable lookup state: written by setState(int), read by objectVal(int)
+ private int localId;
+ private FunctionValues values;
+
+ public ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
+ readerContexts = searcher.getIndexReader().leaves();
+ this.valueSource = valueSource;
+ docValuesArr = new FunctionValues[readerContexts.size()];
+ fContext = ValueSource.newContext(searcher);
+ }
+
+ /**
+ * Positions this accessor on the segment containing the parsed global
+ * Lucene document id: lazily obtains the segment's {@link FunctionValues}
+ * and computes the segment-local document id.
+ * @param docid the global (top level) Lucene document id
+ */
+ private void setState(int docid) {
+ int idx = ReaderUtil.subIndex(docid, readerContexts); //segment index for docid
+ AtomicReaderContext rcontext = readerContexts.get(idx);
+ values = docValuesArr[idx];
+ if (values == null) { //FunctionValues not yet created for this segment
+ try {
+ docValuesArr[idx] = values = valueSource.getValues(fContext, rcontext);
+ } catch (IOException e) {
+ //wrap as unchecked SolrException: callers do not expect IO errors here
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
+ }
+ localId = docid - rcontext.docBase; //global id -> segment-local id
+ }
+
+ /**
+ * The value of the {@link ValueSource} for the document with the parsed
+ * global Lucene document id.
+ * @param docid the global Lucene document id
+ * @return the value as returned by {@link FunctionValues#objectVal(int)}
+ */
+ public Object objectVal(int docid) {
+ setState(docid);
+ return values.objectVal(localId);
+ }
+}
\ No newline at end of file
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Aug 23 09:31:48 2013
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+#===============================================================================
+#Properties specific to the FST linking engine
+#===============================================================================
+org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.name=Apache \
+Stanbol Enhancer Engine: FST Linking
+org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.description=Lucene \
+FST based Entity Linking Engine implementation.
+
+enhancer.engines.linking.solrfst.solrcore.name=Solr Core
+enhancer.engines.linking.solrfst.solrcore.description=The reference to the SolrCore. \
+Supports the '{server-name}:{core-name}' syntax to reference a specific Managed- / \
+Referenced SolrServer. If {server-name} is not present the configured {core-name} is \
+assumed to be available on the default SolrServer. Remote SolrServer are NOT supported!
+
+enhancer.engines.linking.solrfst.fieldEncoding.name=Field Name Encoding
+enhancer.engines.linking.solrfst.fieldEncoding.description=Specifies how FieldNames \
+of the SolrCore are encoded. This is mainly used to specify the pattern used to \
+name fields holding entity labels of different languages. The 'SolrYard' supports \
+the encoding used by the Stanbol Entityhub SolrYard implementation. If 'None' is \
+selected the exact field names used by the SolrCore need to be configured.
+enhancer.engines.linking.solrfst.fieldEncoding.option.none=None
+enhancer.engines.linking.solrfst.fieldEncoding.option.solrYard=SolrYard
+enhancer.engines.linking.solrfst.fieldEncoding.option.minusPrefix='-' Prefix: '{lang}-{name}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.underscorePrefix='_' Prefix: '{lang}_{name}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.minusSuffix='-' Suffix: '{name}-{lang}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.underscoreSuffix='_' Suffix: '{name}_{lang}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.atPrefix='@' Prefix: '{lang}@{name}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.atSuffix='@' Suffix: '{name}@{lang}'
+
+enhancer.engines.linking.solrfst.fstconfig.name=FST Corpora configuration
+enhancer.engines.linking.solrfst.fstconfig.description=Configuration for the FST \
+Corpora. Syntax: '{lang};{param-name}={param-value};{param-name}={param-value};...' \
+Supported {param-name}s: 'field' ... the field name of the SolrIndex used for the \
+FST corpus (default: rdfs:label). The configured field name is encoded using the \
+Field Name Encoding. \
+'fst' ... the {base-name} of the file with the serialized FST model (default: {field} with \
+none alpha-numeric chars replaced by '_'). The actual file name is '{base-name}.{lang}.fst'. \
+Files are located in the 'fst' folder relative to the instance directory of the \
+configured SolrCore. \
+'generate' ... Boolean switch that allows to enable runtime generation of FST \
+corpora (default: false) \
+
+enhancer.engines.linking.solrfst.typeField.name=Entity Type Field
+enhancer.engines.linking.solrfst.typeField.description=The Solr Field holding the \
+type information of Entities. Values are expected to be URIs
+
+enhancer.engines.linking.solrfst.rankingField.name=Entity Ranking Field
+enhancer.engines.linking.solrfst.rankingField.description=The Solr Field holding the \
+Entity Ranking (importance of the Entity within the knowledge base). Values \
+are expected to be floating point numbers.
+
+enhancer.engines.linking.solrfst.fstThreadPoolSize.name=FST Thread Pool Size
+enhancer.engines.linking.solrfst.fstThreadPoolSize.description=The size of the \
+thread pool used for the runtime creation of FST models. NOTE that memory allocation \
+during creation is considerably higher than for holding the built model (up to two times), \
+so creating multiple models in parallel may require a lot of heap space. If memory \
+allocation is not an issue this value should be set based on the available CPU cores \
+and the resources one would like to assign to the creation of FST models.
+
+enhancer.engines.linking.solrfst.entityCacheSize.name=Entity Cache Size
+enhancer.engines.linking.solrfst.entityCacheSize.description=Used to configure \
+the size of the cache used for Entity information. While the FST linking is \
+fully performed in memory this engine still needs to load tagging relevant fields \
+(labels, types, redirects and entity ranking) for matched entities from the disc. \
+The EntityCache is an LRU cache for such information (default is 65k entities)
+
+#===============================================================================
+#Properties and Options used to configure
+#===============================================================================
+
+enhancer.engines.linking.suggestions.name=Max Suggestions
+enhancer.engines.linking.suggestions.description=The maximum number of suggestions
+
+enhancer.engines.linking.minSearchTokenLength.name=Min Token Length
+enhancer.engines.linking.minSearchTokenLength.description=The minimum \
+length of Tokens used to lookup Entities within the Controlled Vocabulary. This parameter is ignored \
+in case a POS (Part of Speech) tagger is available for the language of the parsed content.
+
+enhancer.engines.linking.caseSensitive.name=Case Sensitivity
+enhancer.engines.linking.caseSensitive.description=Allows to enable/disable \
+case sensitive ranking. NOTE that the linking is based on the Solr FieldType of the \
+FST field. This only affects the ranking (fise:confidence value) of suggestions.
+
+enhancer.engines.linking.properNounsState.name=Link ProperNouns only
+enhancer.engines.linking.properNounsState.description=If activated \
+only ProperNouns will be matched against the Vocabulary. If deactivated any Noun will be matched. \
+NOTE that this parameter requires a tag of the POS TagSet to be mapped against 'olia:ProperNoun'. \
+Otherwise mapping will not work as expected.
+
+enhancer.engines.linking.processedLanguages.name=Processed Languages
+enhancer.engines.linking.processedLanguages.description=Languages to \
+process and optionally language specific configurations. Syntax "{lang};{param-name}={param-value};\
+{param-name}={param-value};...". Supported {param-name}s: "lc" - processed Lexical Categories (see \
+LexicalCategory enumeration for possible values); "pos" - processed Pos types (see Pos enumeration \
+for possible values); "tag" - processed string pos tags; "prob" - minimum probability of pos annotations.
+
+enhancer.engines.linking.defaultMatchingLanguage.name=Default Matching Language
+enhancer.engines.linking.defaultMatchingLanguage.description=The language \
+used in addition to the language detected for the analysed text to search for Entities. Typically this \
+configuration is an empty string to search for labels without any language defined, but for some data \
+sets (such as DBpedia.org) that add languages to any labels it might improve results to change this \
+configuration (e.g. to 'en' in the case of DBpedia.org).
+
+enhancer.engines.linking.typeMappings.name=Type Mappings
+enhancer.engines.linking.typeMappings.description=This allows to add \
+additional entity-type > text-annotation-type mappings. Such mappings are used to determine the \
+'dc:type' value of the 'fise:TextAnnotation' created for extracted entities. Usage: \
+variant (a) '{uri}' short for {uri} > {uri} or (b) '{source1};{source2};..;{sourceN} > {target}'. \
+Note that a {source} may be only mapped to a single {target}. Multiple {source} types \
+can be mapped to the same {target}.
+
+enhancer.engines.linking.typeField.name=Type Field
+enhancer.engines.linking.typeField.description=The field used to \
+retrieve the types of matched Entities. Values of that field are expected to be URIs
+
+enhancer.engines.linking.entityTypes.name=Entity Type Filter
+enhancer.engines.linking.entityTypes.description=Allows to define a white/black list \
+based on the types of Entities. Use '!{uri}' for black listing and '{uri}' for white \
+listing. Include '*' to force white listing (e.g. to allow Entities without any type). \
+Rules are processed based on their order.
+
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties Fri Aug 23 09:31:48 2013
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Root logger option
+log4j.rootLogger=INFO, stdout
+
+# Direct log messages to stdout
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
+log4j.logger.org.apache.stanbol.enhancer.engines.keywordextraction=DEBUG
\ No newline at end of file