You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/08/23 11:31:49 UTC
svn commit: r1516784 [3/3] - in
/stanbol/trunk/enhancement-engines/lucenefstlinking: ./ src/ src/main/
src/main/java/ src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/stanbol/
src/main/java/org/apache/stanbol/enhancer/ src/main/jav...
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilterStream.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,232 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.opensextant.solrtexttagger.TaggingAttribute;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Classifies Tokens in the Solr {@link TokenStream} with the {@link TaggingAttribute}
+ * based on NLP processing results present in the {@link AnalysedText}. This
+ * implementation classifies Tokens similar to the {@link EntityLinkingEngine}.
+ * It uses the {@link TextProcessingConfig} for its configuration.<p>
+ * <b> Implementation Details</b><p>
+ * While this code does not directly use {@link ProcessingState} it serves a
+ * similar purpose.<p>
+ * <ul>
+ * <li>This code needs to deal with potentially different tokenization present
+ * in the {@link AnalysedText} and the {@link TokenStream}. The implemented
+ * semantics does mark Tokens in the {@link TokenStream} as
+ * <code>{@link TaggingAttribute#isTaggable()} == true</code> if they do overlap
+ * with a {@link TokenData#isLinkable} token in the {@link AnalysedText}.
+ * <li> {@link TokenData#isMatchable} tokens are also considered as
+ * <code>{@link TaggingAttribute#isTaggable()} == true</code> if a
+ * {@link TokenData#isMatchable} token is following within two tokens of the
+ * {@link AnalysedText}. This range is extended if other matchable tokens are
+ * within the lookahead range. However the range is never extended over a
+ * section border.
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+public class LinkableTokenFilterStream extends TokenFilter {
+
+    private final Logger log = LoggerFactory.getLogger(LinkableTokenFilterStream.class);
+
+    /**
+     * The {@link SpanTypeEnum span types} processed when analysing a section.
+     * Required to use {@link SectionData}.
+     */
+    private static final Set<SpanTypeEnum> PROCESSED_SPAN_TYPES = EnumSet.of(
+        SpanTypeEnum.Chunk,SpanTypeEnum.Token);
+    /**
+     * The NLP processing results
+     */
+    private AnalysedText at;
+    /**
+     * The language of the text
+     */
+    //private String lang;
+    /**
+     * If the language is unicase (has no upper/lower case distinction) or not
+     */
+    private boolean isUnicaseLanguage;
+    /**
+     * Defines how NLP processing results are processed to determine Words that
+     * need to be looked-up in the vocabulary
+     */
+    private LanguageProcessingConfig lpc;
+
+    /**
+     * Iterator over all sections of the {@link AnalysedText}. Initialised by
+     * {@link #reset()}: sentences if present, otherwise the whole text as a
+     * single section.
+     */
+    private Iterator<? extends Section> sections;
+    /**
+     * The current section (<code>null</code> if the end was reached)
+     */
+    private SectionData sectionData;
+    /**
+     * Iterator over all {@link Token}s in the current section
+     */
+    private Iterator<TokenData> tokenIt;
+    /**
+     * The current Token of the {@link AnalysedText} (the NLP tokenization,
+     * which may differ from the Solr tokenization of the wrapped stream)
+     */
+    private TokenData token;
+
+    // session statistics: how many stream tokens were marked taggable vs. seen
+    private int lookupCount = 0;
+    private int incrementCount = 0;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
+    private final TaggingAttribute taggable = addAttribute(TaggingAttribute.class);
+
+    /**
+     * Creates a filter that sets the {@link TaggingAttribute} of the parsed
+     * {@link TokenStream} based on the NLP results of the {@link AnalysedText}.
+     * @param input the Solr token stream to filter
+     * @param at the NLP processing results for the same text
+     * @param lang the language of the text (used for the unicase check)
+     * @param lpc the language specific text processing configuration
+     */
+    protected LinkableTokenFilterStream(TokenStream input, AnalysedText at,
+            String lang, LanguageProcessingConfig lpc) {
+        super(input);
+        this.at = at;
+        //this.lang = lang;
+        this.lpc = lpc;
+        this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
+                UNICASE_SCRIPT_LANUAGES.contains(lang);
+    }
+
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        Iterator<Sentence> sentences = at.getSentences();
+        //if no sentence annotations are present process the whole text as a
+        //single section
+        this.sections = sentences.hasNext() ? sentences : Collections.singleton(at).iterator();
+        sectionData = null;
+        tokenIt = null;
+        incrementCount = 0;
+        lookupCount = 0;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+        if(input.incrementToken()){
+            incrementCount++;
+            boolean first = true;
+            TokenData token;
+            boolean lookup = false;
+            int lastMatchable = -1;
+            int lastIndex = -1;
+            //(1) inspect all NLP tokens overlapping the current stream token
+            while((token = nextToken(first)) != null){
+                first = false;
+                if(token.isLinkable){
+                    lookup = true;
+                } else if (token.isMatchable){
+                    lastMatchable = token.index;
+                    lastIndex = lastMatchable;
+                } //else if(token.hasAlphaNumeric){
+                //    lastIndex = token.index;
+                //}
+            }
+            //(2) lookahead: a matchable token is also marked taggable if a
+            //linkable token follows within the lookahead range. The range is
+            //extended for every further matchable token encountered, but never
+            //beyond the current section (sectionData).
+            if(!lookup && lastIndex >= 0 && sectionData != null){
+                List<TokenData> tokens = sectionData.getTokens();
+                int maxLookahead = Math.max(lastIndex, lastMatchable+3);
+                for(int i = lastIndex+1;!lookup && i < maxLookahead && i < tokens.size(); i++){
+                    token = tokens.get(i);
+                    if(token.isLinkable){
+                        lookup = true;
+                    } else if(token.isMatchable && (i+1) == maxLookahead){
+                        maxLookahead++; //increase lookahead for matchable tokens
+                    }
+                }
+            }
+            this.taggable.setTaggable(lookup);
+            if(lookup){
+                if(log.isTraceEnabled()){
+                    log.trace("Solr Token: [{},{}]: {}", new Object[]{
+                            offset.startOffset(), offset.endOffset(), termAtt});
+                }
+                lookupCount++;
+            }
+            return true;
+        } else {
+            //NOTE: may log NaN if the stream was empty (incrementCount == 0)
+            log.debug("lookup percentage: {}",lookupCount*100/(float)incrementCount);
+            return false;
+        }
+    }
+
+    /**
+     * Iterating over TokensData requires to iterate over two hierarchy levels:
+     * (1) sections (likely Sentences) and (2) Tokens <p>
+     * <b>NOTE</b> that this method modifies a lot of fields to update the
+     * state of the iteration accordingly. If the {@link #token} field is
+     * <code>null</code> after a call to this method this indicates that the
+     * end of the {@link Token} in the {@link AnalysedText} was reached.
+     * @param first is this the first call for the current {@link #offset} state?
+     * @return the token or <code>null</code> if there are no more tokens for
+     * the current {@link #offset}
+     */
+    private TokenData nextToken(boolean first){
+        final boolean isToken;
+        if(token == null || //on the first call
+                !first || //not the first call within one #incrementToken()
+                //current Token is before the current offset
+                token.token.getEnd() <= offset.startOffset()){
+            if(incrementTokenData()){ //get the next token
+                //the next token still overlaps with the current offset
+                isToken = token.token.getStart() < offset.endOffset();
+            } else { //end of stream
+                isToken = false;
+            }
+        } else { //check the current #token
+            isToken = token.token.getStart() < offset.endOffset();
+        }
+        return isToken ? token : null;
+    }
+    /**
+     * Increments the {@link #token} and - if necessary also the {@link #sectionData
+     * section}.
+     * @return <code>true</code> unless there are no more tokens
+     */
+    private boolean incrementTokenData(){
+        if(tokenIt == null || !tokenIt.hasNext()){
+            sectionData = null;
+            tokenIt = null;
+            //skip sections without tokens until one with tokens is found
+            while(sections.hasNext() && (tokenIt == null || !tokenIt.hasNext())){
+                //analyse NLP results for the next Section
+                sectionData = new SectionData(lpc, sections.next(),
+                    PROCESSED_SPAN_TYPES, isUnicaseLanguage);
+                tokenIt = sectionData.getTokens().iterator();
+            }
+            if(tokenIt != null && tokenIt.hasNext()){
+                token = tokenIt.next(); //first token of the next section
+                return true;
+            } else { //reached the end .. clean up
+                sectionData = null;
+                tokenIt = null;
+                return false;
+            }
+        } else { //more token in the same section
+            token = tokenIt.next();
+            return true;
+        }
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,189 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Represents an Entity that matches somewhere in the tagged text.
+ * <p>
+ * Matches are generated for {@link #id Lucene Document IDs} and
+ * {@link #getUri() Solr Document ids} (the URI of the matching entity). On the
+ * first access to the {@link #getLabels() labels}, {@link #getTypes() types}
+ * or {@link #getRedirects()} all those information are lazily retrieved by
+ * accessing the data stored in the index. The {@link FieldLoader} instance
+ * parsed in the constructor is used to load those information.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class Match {
+
+    private static final Logger log = LoggerFactory.getLogger(Match.class);
+
+    /**
+     * Lucene document id
+     */
+    public final int id;
+
+    /** Used to lazily load the {@link #values field values} from the index. */
+    private FieldLoader fieldLoader;
+
+    /** Lazily loaded field values (<code>null</code> until first access). */
+    private Map<FieldType,Object> values;
+    /** Set on the first failed load attempt; further loads are skipped. */
+    private boolean error = false;
+
+    /** The label that matched within the tagged text. */
+    private Literal matchLabel;
+    /**
+     * The score of the Match
+     */
+    private double score;
+
+    Match(int id, FieldLoader fieldLoader){
+        this.id = id;
+        this.fieldLoader = fieldLoader;
+    }
+
+    /**
+     * The URI of the matching entity or <code>null</code> if the data could
+     * not be loaded from the index.
+     */
+    public String getUri() {
+        return getValue(FieldType.id);
+    }
+
+    /**
+     * The labels of the matching entity (empty on load errors)
+     */
+    public Collection<Literal> getLabels(){
+        return getValues(FieldType.label);
+    }
+
+    /**
+     * The types of the matching entity (empty on load errors)
+     */
+    public Collection<UriRef> getTypes(){
+        return getValues(FieldType.type);
+    }
+
+    /**
+     * The redirects of the matching entity (empty on load errors)
+     */
+    public Collection<UriRef> getRedirects(){
+        return getValues(FieldType.redirect);
+    }
+    /**
+     * The entity ranking or <code>null</code> if not present
+     */
+    public Double getRanking(){
+        return getValue(FieldType.ranking);
+    }
+    /**
+     * Typed access to multi valued fields.
+     * @throws IllegalArgumentException if the parsed type is single valued
+     */
+    private <T> Collection<T> getValues(FieldType type){
+        if(!type.isMultivalued()){
+            throw new IllegalArgumentException("The parsed field Type '" + type
+                + "' is not multi valued!");
+        }
+        //use the typed Collections.emptySet() instead of the raw EMPTY_SET
+        Collection<T> value = getValue(type);
+        return value == null ? Collections.<T>emptySet() : value;
+    }
+    /**
+     * Typed access to field values. Lazily loads the values via the
+     * {@link FieldLoader} on the first call.
+     * @return the value or <code>null</code> if not present or loading failed
+     */
+    @SuppressWarnings("unchecked")
+    private <T> T getValue(FieldType type){
+        if(error){
+            return null;
+        } else if(values == null){
+            try {
+                values = fieldLoader.load(id);
+            } catch (IOException e) {
+                log.warn("Unable to load Entity for Lucene DocId '"+id+"'!",e);
+                error = true;
+                return null;
+            } catch (RuntimeException e) {
+                log.warn("Error while loading Entity for Lucene DocId '"+id+"'!",e);
+                error = true;
+                return null;
+            }
+        }
+        return (T) values.get(type);
+    }
+
+    /**
+     * Sets the score and the matching label of this Match
+     * @param score the score
+     * @param matchLabel the label that matched within the text
+     */
+    public void setMatch(double score, Literal matchLabel){
+        this.score = score;
+        this.matchLabel = matchLabel;
+    }
+    /**
+     * Allows to update the {@link #getScore() score} without changing the
+     * {@link #getMatchLabel() match}.
+     * @param score the new score
+     */
+    public void updateScore(double score) {
+        this.score = score;
+    }
+    public double getScore() {
+        return score;
+    }
+
+    public Literal getMatchLabel() {
+        return matchLabel;
+    }
+
+    @Override
+    public int hashCode() {
+        return id;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof Match && id == ((Match)o).id;
+    }
+
+    @Override
+    public String toString() {
+        String uri = getUri();
+        return uri != null ? uri : "Match[id: "+id+"|(uri unknown)]";
+    }
+
+    /**
+     * The fields loaded for a Match. Nested enums are implicitly static so the
+     * redundant modifier was removed.
+     */
+    enum FieldType {
+        id(String.class),
+        label(Literal.class, true),
+        type(UriRef.class,true),
+        redirect(UriRef.class,true),
+        ranking(Double.class);
+
+        final Class<?> valueType;
+        final boolean multivalued;
+
+        FieldType(Class<?> type){
+            this(type,false);
+        }
+        FieldType(Class<?> type, boolean multivalued){
+            this.valueType = type;
+            this.multivalued = multivalued;
+        }
+        public Class<?> getValueType() {
+            return valueType;
+        }
+        public boolean isMultivalued() {
+            return multivalued;
+        }
+    }
+
+    /**
+     * Loads the {@link FieldType field values} for a Lucene document id.
+     * Nested interfaces are implicitly static.
+     */
+    interface FieldLoader {
+        Map<FieldType,Object> load(int id) throws IOException;
+    }
+    /**
+     * Compares {@link Match} instances based on the {@link Match#getScore()}
+     */
+    public static final Comparator<Match> SCORE_COMPARATOR = new Comparator<Match>() {
+
+        @Override
+        public int compare(Match a, Match b) {
+            return Double.compare(b.score,a.score); //higher first
+        }
+
+    };
+    /**
+     * Compares {@link Match} instances based on the {@link Match#getRanking()}.
+     * <code>null</code> values are assumed to be the smallest.
+     */
+    public static final Comparator<Match> ENTITY_RANK_COMPARATOR = new Comparator<Match>(){
+        @Override
+        public int compare(Match arg0, Match arg1) {
+            Double r1 = arg0.getRanking();
+            Double r2 = arg1.getRanking();
+            //higher rankings first; null is smaller than any ranking
+            return r2 == null ? r1 == null ? 0 : -1 : r1 == null ? 1 : r2.compareTo(r1);
+        }
+    };
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Tag.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,125 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+
+/**
+ * Minimal helper class to represent a Tag.<p>
+ * TODO: This will need to collect additional values from the suggested
+ * SolrDocuments:<ul>
+ * <li> the type information - {@link EntityLinkerConfig#TYPE_FIELD} values
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+class Tag {
+
+    /**
+     * The [start,end] character offsets of this tag within the
+     * {@link AnalysedText}
+     */
+    final int[] span;
+    /**
+     * Matching documents
+     */
+    private Set<Match> ids;
+
+    /** Suggestions (expected to be sorted with the best suggestion first) */
+    private List<Match> suggestions;
+    /** The fise:selected-text value */
+    private String anchor;
+
+    Tag(int start, int end) {
+        span = new int[]{start,end};
+    }
+    Tag(int[] span) {
+        this.span = span;
+    }
+
+    /**
+     * Adds matching documents. The first parsed set is taken over as-is (and
+     * MAY be modified by later calls).
+     * @param ids the matches to add
+     */
+    public void addIds(Set<Match> ids){
+        if(this.ids == null){
+            this.ids = ids;
+        } else {
+            this.ids.addAll(ids);
+        }
+    }
+    /**
+     * The matching documents (empty if none)
+     */
+    public Set<Match> getMatches(){
+        //typed emptySet() avoids the unchecked raw EMPTY_SET constant
+        return ids == null ? Collections.<Match>emptySet() : ids;
+    }
+    public int getStart() {
+        return span[0];
+    }
+
+    public int getEnd() {
+        return span[1];
+    }
+    /**
+     * Setter for the Anchor text
+     * @param anchor
+     */
+    public void setAnchor(String anchor) {
+        this.anchor = anchor;
+    }
+    /**
+     * Getter for the Anchor text
+     * @return the fise:selected-text value
+     */
+    public String getAnchor() {
+        return anchor;
+    }
+
+    public void setSuggestions(List<Match> suggestions) {
+        this.suggestions = suggestions;
+    }
+
+    public List<Match> getSuggestions() {
+        return suggestions;
+    }
+
+    @Override
+    public int hashCode() {
+        return Arrays.hashCode(span);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof Tag && Arrays.equals(span, ((Tag)o).span);
+    }
+
+    @Override
+    public String toString() {
+        return new StringBuilder("Tag").append(Arrays.toString(span)).toString();
+    }
+
+    /**
+     * Sorts spans by ascending start offset; on equal starts the longer span
+     * (higher end offset) comes first.
+     */
+    static final Comparator<int[]> SPAN_COMPARATOR = new Comparator<int[]>() {
+
+        @Override
+        public int compare(int[] a, int[] b) {
+            //Integer.compare avoids the manual (and overflow prone) ternaries
+            int c = Integer.compare(a[0], b[0]);
+            if(c == 0){ //same start: longer spans first
+                c = Integer.compare(b[1], a[1]);
+            }
+            return c;
+        }
+
+    };
+
+    /**
+     * Returns the score of the best {@link #getSuggestions() suggestion}
+     * @return the score of the first suggestion or <code>0</code> if none
+     */
+    public double getScore() {
+        return suggestions == null || suggestions.isEmpty() ? 0 :
+            suggestions.get(0).getScore();
+    }
+
+}
\ No newline at end of file
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,510 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queries.function.valuesource.IfFunction;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldLoader;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.Match.FieldType;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.EntityCache;
+import org.apache.stanbol.enhancer.engines.lucenefstlinking.impl.ValueSourceAccessor;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.opensextant.solrtexttagger.TaggerFstCorpus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Profile created based on the {@link IndexConfiguration} for processing a
+ * parsed ContentItem. <p>
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class TaggingSession implements Closeable {
+
+    private final Logger log = LoggerFactory.getLogger(TaggingSession.class);
+
+    /** The language of the processed document */
+    private String language;
+
+    /** The language specific FST corpus (<code>null</code> if not configured) */
+    private Corpus langCorpus;
+
+    /** The default FST corpus (<code>null</code> if not configured or same as langCorpus) */
+    private Corpus defaultCorpus;
+
+    /**
+     * The Solr document id field holding the URI of the Entity.
+     */
+    protected final String idField;
+
+    /**
+     * The Solr field holding the labels in the language of the current Document
+     */
+    protected final String labelField;
+
+    protected final Language labelLang;
+    /**
+     * The Solr field holding the labels in the default matching language or
+     * <code>null</code> if the same as {@link #labelField}
+     */
+    protected final String defaultLabelField;
+
+    protected final Language defaultLabelLang;
+
+    /** All Solr fields that need to be loaded for matching documents */
+    protected final Set<String> solrDocfields = new HashSet<String>();
+
+    protected final IndexConfiguration config;
+
+    protected final String typeField;
+    protected final String redirectField;
+    protected final String rankingField;
+    private final RefCounted<SolrIndexSearcher> searcherRef;
+    /*
+     * Document Cache and session statistics for the cache
+     */
+    private RefCounted<EntityCache> documentCacheRef;
+    private int docLoaded = 0;
+    private int docCached = 0;
+    private int docAppended = 0;
+    private final ValueSourceAccessor uniqueKeyCache; //no longer used
+    private final FieldLoaderImpl fieldLoader;
+
+
+    TaggingSession(String language, IndexConfiguration config) throws CorpusException {
+        this.language = language;
+        this.config = config;
+        CorpusInfo langCorpusInfo = config.getCorpus(language);
+        CorpusInfo defaultCorpusInfo = config.getDefaultCorpus();
+
+        //obtain the Solr Document Id field
+        SchemaField idSchemaField = config.getIndex().getSchema().getUniqueKeyField();
+        idField = idSchemaField.getName();
+        solrDocfields.add(idField);
+
+        //obtain the language specific fields for the session
+        if(langCorpusInfo == null && defaultCorpusInfo == null){
+            //this should not happen, because the canEnhance method of the
+            //engine should already reject such calls
+            throw new IllegalStateException("No FST Corpus configured for language '"
+                +language+"' and also no default FST Corpus is present.!");
+        }
+        if(langCorpusInfo != null){
+            this.langCorpus = new Corpus(langCorpusInfo,
+                obtainFstCorpus(langCorpusInfo));
+            this.labelField = langCorpusInfo.storedField;
+            solrDocfields.add(labelField);
+            this.labelLang = langCorpusInfo.language == null ||
+                    StringUtils.isBlank(langCorpusInfo.language) ? null :
+                        new Language(langCorpusInfo.language);
+        } else {
+            this.labelField = null;
+            this.labelLang = null;
+        }
+        if(defaultCorpusInfo != null && !defaultCorpusInfo.equals(langCorpusInfo)){
+            this.defaultCorpus = new Corpus(defaultCorpusInfo,
+                obtainFstCorpus(defaultCorpusInfo));
+            this.defaultLabelField = defaultCorpusInfo.storedField;
+            solrDocfields.add(defaultLabelField);
+            this.defaultLabelLang = defaultCorpusInfo.language == null ||
+                    StringUtils.isBlank(defaultCorpusInfo.language) ? null :
+                        new Language(defaultCorpusInfo.language);
+        } else {
+            this.defaultCorpus = null;
+            this.defaultLabelField = null;
+            this.defaultLabelLang = null;
+        }
+        if(this.defaultCorpus == null && this.langCorpus == null){
+            //NOTE: the ternary expressions MUST BE parenthesised. Without the
+            //parentheses '+' binds stronger than '?:' so the condition was
+            //evaluated on the concatenated String (always != null) and
+            //'langCorpusInfo.indexedField' was dereferenced even though
+            //langCorpusInfo is null on this code path (causing a NPE instead
+            //of the intended CorpusException).
+            throw new CorpusException("Unable to initialise a FST corpus for language '"
+                + language + "'. Neither the language specific Corpus (field: "
+                + (langCorpusInfo != null ? langCorpusInfo.indexedField : "<undefined>")
+                + ") nor for the default language (field: "
+                + (defaultCorpusInfo != null ? defaultCorpusInfo.indexedField : "<undefined>")
+                + ") is currently available!", null);
+        }
+        if(config.getTypeField() != null){
+            this.typeField = config.getTypeField();
+            solrDocfields.add(typeField);
+        } else {
+            this.typeField = null;
+        }
+        if(config.getRedirectField() != null){
+            this.redirectField = config.getRedirectField();
+            solrDocfields.add(redirectField);
+        } else {
+            this.redirectField = null;
+        }
+        if(config.getRankingField() != null){
+            this.rankingField = config.getRankingField();
+            solrDocfields.add(rankingField);
+        } else {
+            this.rankingField = null;
+        }
+        searcherRef = config.getIndex().getSearcher();
+        SolrIndexSearcher searcher = searcherRef.get();
+        documentCacheRef = config.getEntityCacheManager().getCache(searcher);
+        uniqueKeyCache = null; //no longer used.
+        fieldLoader = new FieldLoaderImpl(searcher.getIndexReader());
+
+    }
+    /**
+     * Used to instantiate {@link Match}es
+     * @param docId the Lucene document Id as returned by the FST corpus
+     * @return the Match instance
+     */
+    public Match createMatch(int docId){
+        return new Match(docId,fieldLoader);
+    }
+
+    /**
+     * Releases the Solr index searcher and the document cache references held
+     * by this session.
+     */
+    public void close(){
+        searcherRef.decref(); //clean up the Solr index searcher reference
+        documentCacheRef.decref(); //clean up the DocumentCache reference
+    }
+    /**
+     * The language of this Session. This is typically the language detected for
+     * the document.
+     * @return the language of this Session
+     */
+    public String getLanguage() {
+        return language;
+    }
+
+    public Corpus getDefaultCorpus() {
+        return defaultCorpus;
+    }
+
+    public Corpus getLanguageCorpus() {
+        return langCorpus;
+    }
+
+    public SolrIndexSearcher getSearcher() {
+        return searcherRef.get();
+    }
+
+    /**
+     * Creates a new TaggingSession for the parsed language based on the
+     * parsed index configuration.
+     * @throws CorpusException if no FST corpus is available for the language
+     */
+    public static TaggingSession createSession(IndexConfiguration indexConfig,
+            String language) throws CorpusException {
+        TaggingSession session = new TaggingSession(language, indexConfig);
+        return session;
+    }
+
+    public EntityCache getDocumentCache(){
+        return documentCacheRef.get();
+    }
+    /**
+     * The number of Lucene Documents loaded from disk in this session so far
+     * @return the number of loaded documents
+     */
+    public int getSessionDocLoaded(){
+        return docLoaded;
+    }
+    /**
+     * The number of Lucene Documents retrieved from the {@link #getDocumentCache()}
+     * in this session so far
+     * @return the number of cached documents
+     */
+    public int getSessionDocCached(){
+        return docCached;
+    }
+    /**
+     * The number of Lucene Documents retrieved from the {@link #getDocumentCache()},
+     * but with missing fields from the Cache. For such documents the additional
+     * fields (typically labels of different languages) were read from disk and
+     * added to the cached document.
+     * @return the number of appended documents
+     */
+    public int getSessionDocAppended(){
+        return docAppended;
+    }
+
+
+    /**
+     * Obtains the FST corpus for the parsed CorpusInfo.
+     * @param fstInfo the info about the corpus
+     * @return the corpus - NOTE(review): this MAY return <code>null</code>
+     * (e.g. on a file error when re-creation is allowed and enqueued) - callers
+     * need to cope with <code>null</code>; confirm this is intended.
+     * @throws CorpusException if the corpus is not (yet) available
+     */
+    private TaggerFstCorpus obtainFstCorpus(CorpusInfo fstInfo) throws CorpusException {
+        TaggerFstCorpus fstCorpus;
+        synchronized (fstInfo) { // one at a time
+            fstCorpus = fstInfo.getCorpus();
+            if (fstCorpus == null) {
+                if (fstInfo.isEnqueued()) {
+                    throw new CorpusException("The FST corpus for language '"
+                        + fstInfo.language + "' is enqueued for creation, but not yet "
+                        + "available. Try at a later point in time", null);
+                }
+                if (fstInfo.isFstCreationError()) {
+                    throw new CorpusException(fstInfo.getErrorMessage(), null);
+                }
+                if (fstInfo.isFstFileError() && fstInfo.allowCreation) {
+                    //try to recreate the FST corpus
+                    if(config.getExecutorService() != null){
+                        // TODO: this code should get moved to a CorpusManager class
+                        config.getExecutorService().execute(
+                            new CorpusCreationTask(config.getIndex(), fstInfo));
+                        throw new CorpusException("The FST corpus for language '"
+                            + fstInfo.language + "' was invalid and is now "
+                            + "enqueued for re-creation. Retry at a later "
+                            + "point in time.", null);
+                    } else {
+                        throw new CorpusException(fstInfo.getErrorMessage(), null);
+                    }
+                }
+            }
+
+        }
+        return fstCorpus;
+    }
+    /**
+     * {@link FieldLoader} implementation used to create {@link Match} instances
+     */
+    private class FieldLoaderImpl implements FieldLoader {
+
+        private static final String LOADED_FIELDS_FIELD_NAME = "__loadedFields__";
+
+        /**
+         * Marker fields added to cached documents so that later sessions can
+         * detect which Solr fields were loaded for the cached document.
+         */
+        private List<Field> loadedFieldsFields;
+
+        private final IndexReader reader;
+        /**
+         * Cache similar to the {@link EntityCache}, but with a scope bound to
+         * life cycle of this FieldLoaderImpl instance (a single TaggingSession).
+         * This cache ensures the Lucene Documents are not loaded twice while
+         * processing the same document (even if no EntityCache is configured or
+         * the size of the EntityCache is too small).
+         */
+        private final Map<Integer,Document> sessionCache = new HashMap<Integer,Document>();
+        /**
+         * The EntityCache instance that caches entity data over multiple sessions
+         */
+        private final EntityCache cache;
+
+        public FieldLoaderImpl(IndexReader reader) {
+            this.reader = reader;
+            loadedFieldsFields = new ArrayList<Field>(solrDocfields.size());
+            for(String loadedFieldName : solrDocfields){
+                loadedFieldsFields.add(new StringField(LOADED_FIELDS_FIELD_NAME,
+                    loadedFieldName, Store.NO));
+            }
+            this.cache = documentCacheRef.get();
+        }
+
+        /**
+         * Loads (or retrieves from the session and entity caches) the Lucene
+         * Document for the parsed id and converts it to {@link FieldType}
+         * keyed values.
+         * @param id the Lucene document id
+         * @return the field values of the document
+         * @throws IOException if the document can not be read from the index
+         */
+        @Override
+        public Map<FieldType,Object> load(int id) throws IOException {
+            //load the Lucene Document for the id
+            Integer ID = Integer.valueOf(id);
+            Document doc = sessionCache.get(ID);
+            if(doc == null){
+                doc = cache.get(ID);
+                if(doc == null){
+                    doc = reader.document(id, solrDocfields);
+                    //if we read a doc from the index we need to add information about
+                    //the fields we loaded (especially the languages of labels loaded).
+                    //NOTE that those information will never be stored in the index. They
+                    //are only kept in-memory when caching this document.
+                    for(Field loadedFieldsField : loadedFieldsFields){
+                        doc.add(loadedFieldsField);
+                    }
+                    docLoaded++;
+                    cache.cache(ID, doc);
+                } else {
+                    //we need to check if the fields of the cached doc are sufficient
+                    //for the requested Solr Document fields
+                    Set<String> fields = new HashSet<String>(solrDocfields);
+                    String[] loaded = doc.getValues(LOADED_FIELDS_FIELD_NAME);
+                    for(int i=0;i < loaded.length && !fields.isEmpty(); i++){
+                        fields.remove(loaded[i]);
+                    }
+                    if(!fields.isEmpty()){ //we are missing some fields
+                        //need to load it from the index
+                        Document tmp = reader.document(id, fields);
+                        //add the additional fields to the cached doc
+                        for(IndexableField field : tmp.getFields()){
+                            doc.add(field);
+                        }
+                        //also update the loaded fields
+                        for(String loadedField : fields){
+                            doc.add(new StringField(LOADED_FIELDS_FIELD_NAME,
+                                loadedField, Store.NO));
+                        }
+                        //NOTE: no need to update the cache, as we have updated the
+                        //cached value.
+                        docAppended++;
+                    } else {
+                        docCached++;
+                    }
+                }
+                //add this doc to the session cache
+                sessionCache.put(ID, doc);
+            } //else { //document is in the session cache ... just use it
+            //NOTE: The session cache has a minor side effect on the
+            //      EntityCache. Because multiple occurrences of an Entity
+            //      within the Document are not requested on the EntityCache
+            //      LRU based implementations will get slightly different
+            //      statistics. Assuming that the maximum size of the EntityCache
+            //      is >> as the number of Documents matching for the current Text
+            //      this effect can be considered as negligible.
+            //}
+            if(doc != null){
+                Map<FieldType,Object> values =
+                        new EnumMap<Match.FieldType,Object>(FieldType.class);
+                //load the ID
+                values.put(FieldType.id, doc.get(idField));
+                //load the labels
+                Set<Literal> labels = new HashSet<Literal>();
+                if(labelField != null){ //no language specific corpus configured
+                    for(String label : doc.getValues(labelField)){
+                        labels.add(new PlainLiteralImpl(label, labelLang));
+                    }
+                }
+                if(defaultLabelField != null){
+                    for(String label : doc.getValues(defaultLabelField)){
+                        labels.add(new PlainLiteralImpl(label, defaultLabelLang));
+                    }
+                }
+                values.put(FieldType.label, labels);
+                //load the types
+                if(typeField != null){
+                    Set<UriRef> types = new HashSet<UriRef>();
+                    for(String type : doc.getValues(typeField)){
+                        types.add(new UriRef(type));
+                    }
+                    values.put(FieldType.type, types);
+                }
+                //load the redirects
+                if(redirectField != null){
+                    Set<UriRef> redirects = new HashSet<UriRef>();
+                    for(String redirect : doc.getValues(redirectField)){
+                        redirects.add(new UriRef(redirect));
+                    }
+                    values.put(FieldType.redirect, redirects);
+                }
+                //load the rankings
+                if(rankingField != null){
+                    //NOTE: documents might not have a ranking value - getField(..)
+                    //returns null in that case (would have caused a NPE before)
+                    IndexableField rankingIdxField = doc.getField(rankingField);
+                    Number num = rankingIdxField == null ? null :
+                            rankingIdxField.numericValue();
+                    Double ranking;
+                    if(num instanceof Double){
+                        ranking = (Double)num;
+                    } else if (num != null){
+                        ranking = Double.valueOf(num.doubleValue());
+                    } else { //num == null ... try the stored String value
+                        String value = doc.get(rankingField);
+                        if(value != null){
+                            try {
+                                ranking = Double.valueOf(value);
+                            } catch (NumberFormatException e) {
+                                ranking = null;
+                            }
+                        } else {
+                            ranking = null;
+                        }
+                    }
+                    if(ranking != null){
+                        values.put(FieldType.ranking, ranking);
+                    }
+                }
+                return values;
+            } else {
+                throw new IOException("No document found for Lucene doc id '"+id+"'!");
+            }
+        }
+    }
+
+    /**
+     * An FST corpus together with the {@link CorpusInfo} it was created from.
+     */
+    public class Corpus {
+
+        private CorpusInfo corpusInfo;
+        private TaggerFstCorpus fst;
+
+        Corpus(CorpusInfo corpusInfo, TaggerFstCorpus fst){
+            this.corpusInfo = corpusInfo;
+            this.fst = fst;
+        }
+
+        /**
+         * @deprecated use {@link #getLanguage()} instead (typo in the name).
+         * Kept for backward compatibility.
+         */
+        @Deprecated
+        public String getLanugage(){
+            return getLanguage();
+        }
+
+        public String getLanguage(){
+            return corpusInfo.language;
+        }
+
+        public Analyzer getAnalyzer(){
+            return corpusInfo.analyzer;
+        }
+
+        public TaggerFstCorpus getFst(){
+            return fst;
+        }
+
+        public String getIndexedField() {
+            return corpusInfo.indexedField;
+        }
+
+        public String getStoredField(){
+            return corpusInfo.storedField;
+        }
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCache.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,46 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.common.SolrDocument;
+
+/**
+ * A cache for Lucene {@link Document}s holding the Entity information required
+ * for entity linking. This cache is intended to avoid disc access when loading
+ * entity data of entities detected by the FST tagging in the parsed document.
+ * NOTE(review): all methods operate on Lucene {@link Document}s (the original
+ * javadoc mentioned {@link SolrDocument} — presumably a leftover; confirm).
+ * @author Rupert Westenthaler
+ *
+ */
+public interface EntityCache {
+
+ /**
+ * If the current version of the index does not equal this version
+ * the cache needs to be renewed.
+ * @return the version this cache is built upon
+ */
+ Object getVersion();
+ /**
+ * Getter for the Document based on the Lucene document ID
+ * @param docId the Lucene document ID (the unique key)
+ * @return the Document or <code>null</code> if not in the cache
+ */
+ Document get(Integer docId);
+
+ /**
+ * Caches the document for the parsed Lucene document id
+ * @param docId the Lucene document id
+ * @param doc the Document
+ */
+ void cache(Integer docId, Document doc);
+
+ /**
+ * The size of the cache or <code>-1</code> if not available
+ * @return the size or <code>-1</code> if not known
+ */
+ int size();
+
+ /**
+ * The statistics for this cache
+ * @return a human readable representation of the cache statistics
+ */
+ String printStatistics();
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/EntityCacheManager.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,36 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
+
+/**
+ * Manages a cache for Entity data required for EntityLinking. Cached entity
+ * data are represented by Lucene {@link Document}s.
+ * <p>
+ * Implementations are expected to manage a single {@link EntityCache} for the
+ * current version of the Lucene index. A 'new' version is expected as soon as
+ * {@link #getCache(Object)} is called with a different version object. In that
+ * case the current {@link EntityCache} should be cleared and a new empty one
+ * needs to be created. The new cache might get autowarmed (if supported and
+ * configured).
+ * <p>
+ * Implementations need to wait with clearing/closing an outdated {@link EntityCache}
+ * instance until the old version is no longer used
+ * ({@link RefCounted#close()} is called).
+ */
+public interface EntityCacheManager {
+
+ /**
+ * Getter for a reference counting instance of the {@link EntityCache}.
+ * Callers need to ensure that {@link RefCounted#decref()} is called when they
+ * do no longer need the obtained EntityCache instance.
+ * @param version the version object. Typically the current
+ * {@link SolrIndexSearcher} instance can be used as version object as a new
+ * cache instance should be created if a new index searcher was opened by
+ * the SolrCore.
+ * @return A counting reference to the EntityCache
+ */
+ RefCounted<EntityCache> getCache(Object version);
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/FastLRUCacheManager.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,108 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.document.Document;
+import org.apache.solr.search.CacheRegenerator;
+import org.apache.solr.search.FastLRUCache;
+import org.apache.solr.search.SolrCache;
+import org.apache.solr.util.RefCounted;
+
+/**
+ * Implementation of the {@link EntityCacheManager} based on the Solr
+ * {@link FastLRUCache} implementation.
+ * <p>
+ * {@link #getCache(Object)} is synchronized: it performs a check-then-act on
+ * the shared {@link #current} reference, so concurrent callers could otherwise
+ * create two caches for the same version or leak a replaced cache that was
+ * never marked as outdated.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class FastLRUCacheManager implements EntityCacheManager {
+
+    /** the RefCounted for the currently active cache (guarded by <code>this</code>) */
+    RefCounted<EntityCache> current;
+    /** regenerator used for autowarming new cache versions (<code>null</code> to deactivate) */
+    private final CacheRegenerator regenerator;
+    /** immutable configuration parsed to {@link SolrCache#init} */
+    private final Map<String,String> config;
+
+    /**
+     * Creates a cache manager instance with the parsed maximum size and no
+     * support for autowarming
+     * @param size the maximum size or values &lt;= 0 to use the default size
+     */
+    public FastLRUCacheManager(int size){
+        this(size,0,null);
+    }
+    /**
+     * Creates a cache manager instance with the parsed maximum size and support
+     * for autowarming.
+     * @param size the maximum size or values &lt;= 0 to use the default size
+     * @param autowarmCount the number of documents added to a new cache based
+     * on entries of the previous (outdated) version
+     * @param regenerator the regenerator instance used for autowarming. If
+     * <code>null</code> autowarming is deactivated.
+     */
+    public FastLRUCacheManager(int size, int autowarmCount, CacheRegenerator regenerator){
+        Map<String,String> config = new HashMap<String,String>();
+        config.put("name", "Tagging Document Cache");
+        if(size > 0){
+            config.put("size",Integer.toString(size));
+        }
+        if(regenerator != null){ //autowarming requires a regenerator
+            config.put("autowarmCount",Integer.toString(autowarmCount));
+        }
+        this.config = Collections.unmodifiableMap(config);
+        this.regenerator = regenerator;
+    }
+
+
+    @Override
+    public synchronized RefCounted<EntityCache> getCache(Object version) {
+        if(current == null || !current.get().getVersion().equals(version)){
+            if(current != null){
+                //mark the old cache as outdated so it gets cleared and closed
+                //as soon as all handed out references are released
+                ((RefCountedImpl)current).setOutdated();
+            }
+            //create a new cache for the new version
+            SolrCache<Integer,Document> cache = new FastLRUCache<Integer,Document>();
+            cache.init(config, null, regenerator);
+            current = new RefCountedImpl(new SolrEntityCache(version, cache));
+        }
+        current.incref(); //count the reference handed out to the caller
+        return current;
+    }
+
+    /**
+     * {@link RefCounted} implementation that ensures that outdated caches are
+     * cleared and closed as soon as they are no longer in use.
+     *
+     * @author Rupert Westenthaler
+     *
+     */
+    protected class RefCountedImpl extends RefCounted<EntityCache>{
+
+        public RefCountedImpl(SolrEntityCache resource) {
+            super(resource);
+        }
+
+        //volatile: setOutdated() and close() may be called by different threads
+        private volatile boolean outdated;
+
+        /**
+         * Used by the manager implementation to mark the RefCounted EntityCache
+         * as outdated
+         */
+        protected void setOutdated() {
+            outdated = true;
+        }
+
+        /**
+         * Clears and closes the wrapped cache if outdated. Called by
+         * {@link RefCounted} when the reference count drops to zero.
+         */
+        @Override
+        protected void close(){
+            if(outdated){
+                ((SolrEntityCache)get()).close();
+            }
+        }
+
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/cache/SolrEntityCache.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,61 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.cache;
+
+import org.apache.lucene.document.Document;
+import org.apache.solr.search.FastLRUCache;
+import org.apache.solr.search.SolrCache;
+
+/**
+ * Implementation of the {@link EntityCache} interface by using the
+ * {@link SolrCache} API.
+ * <p>
+ * After {@link #close()} was called all operations turn into no-ops:
+ * {@link #get(Integer)} returns <code>null</code>, {@link #cache(Integer, Document)}
+ * is ignored and {@link #size()} returns <code>-1</code> (as permitted by the
+ * {@link EntityCache} contract).
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class SolrEntityCache implements EntityCache {
+
+    private final SolrCache<Integer,Document> cache;
+    private final Object version;
+    //volatile: close() is typically triggered by a different thread than
+    //the readers/writers of this cache
+    private volatile boolean closed;
+
+    public SolrEntityCache(Object version, SolrCache<Integer,Document> cache) {
+        this.cache = cache;
+        this.version = version;
+    }
+
+    @Override
+    public Object getVersion() {
+        return version;
+    }
+
+    @Override
+    public Document get(Integer docId) {
+        return !closed ? cache.get(docId) : null;
+    }
+
+    @Override
+    public void cache(Integer docId, Document doc) {
+        if(!closed){
+            cache.put(docId, doc);
+        }
+    }
+
+    @Override
+    public int size() {
+        //-1 if no longer known because the cache was already closed
+        return !closed ? cache.size() : -1;
+    }
+    @Override
+    public String printStatistics() {
+        return !closed ? cache.getStatistics().toString() : "closed";
+    }
+
+    @Override
+    public String toString() {
+        return cache.getDescription();
+    }
+
+    /**
+     * Clears and closes the wrapped {@link SolrCache}. Idempotent: only the
+     * first call has an effect.
+     */
+    void close(){
+        if(!closed){
+            closed = true;
+            cache.clear();
+            cache.close();
+        }
+    }
+}
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/impl/ValueSourceAccessor.java Fri Aug 23 09:31:48 2013
@@ -0,0 +1,50 @@
+package org.apache.stanbol.enhancer.engines.lucenefstlinking.impl;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.ReaderUtil;
+import org.apache.lucene.queries.function.FunctionValues;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.solr.common.SolrException;
+
+/**
+ * Resolves per-document values of a Lucene {@link ValueSource} over a
+ * (possibly multi-segment) index.
+ * See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}.
+ * <p>
+ * NOTE(review): instances hold mutable per-lookup state ({@code values},
+ * {@code localId}) — not safe for concurrent use without external
+ * synchronization; confirm callers are single threaded.
+ */
+public class ValueSourceAccessor {
+ // implement FunctionValues ? (open question from the original author)
+ /** one leaf (segment) reader context per index segment */
+ private final List<AtomicReaderContext> readerContexts;
+ /** lazily created FunctionValues, one slot per segment (parallel to readerContexts) */
+ private final FunctionValues[] docValuesArr;
+ /** the value source values are resolved for */
+ private final ValueSource valueSource;
+ /** shared context created by ValueSource.newContext (raw type in this Lucene version) */
+ private final Map fContext;
+
+ //mutable lookup state: written by setState(int), read by objectVal(int)
+ private int localId;
+ private FunctionValues values;
+
+ public ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
+ readerContexts = searcher.getIndexReader().leaves();
+ this.valueSource = valueSource;
+ docValuesArr = new FunctionValues[readerContexts.size()];
+ fContext = ValueSource.newContext(searcher);
+ }
+
+ /**
+ * Positions this accessor on the segment containing the parsed global
+ * Lucene document id: lazily obtains the segment's {@link FunctionValues}
+ * and computes the segment-local document id.
+ * @param docid the global (top level) Lucene document id
+ */
+ private void setState(int docid) {
+ int idx = ReaderUtil.subIndex(docid, readerContexts); //segment index for docid
+ AtomicReaderContext rcontext = readerContexts.get(idx);
+ values = docValuesArr[idx];
+ if (values == null) { //FunctionValues not yet created for this segment
+ try {
+ docValuesArr[idx] = values = valueSource.getValues(fContext, rcontext);
+ } catch (IOException e) {
+ //wrap as unchecked SolrException: callers do not expect IO errors here
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
+ }
+ localId = docid - rcontext.docBase; //global id -> segment-local id
+ }
+
+ /**
+ * The value of the {@link ValueSource} for the document with the parsed
+ * global Lucene document id.
+ * @param docid the global Lucene document id
+ * @return the value as returned by {@link FunctionValues#objectVal(int)}
+ */
+ public Object objectVal(int docid) {
+ setState(docid);
+ return values.objectVal(localId);
+ }
+}
\ No newline at end of file
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Aug 23 09:31:48 2013
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+#===============================================================================
+#Properties specific to the FST linking engine
+#===============================================================================
+org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.name=Apache \
+Stanbol Enhancer Engine: FST Linking
+org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineComponent.description=Lucene \
+FST based Entity Linking Engine implementation.
+
+enhancer.engines.linking.solrfst.solrcore.name=Solr Core
+enhancer.engines.linking.solrfst.solrcore.description=The reference to the SolrCore. \
+Supports the '{server-name}:{core-name}' syntax to reference a specific Managed- / \
+Referenced SolrServer. If {server-name} is not present the configured {core-name} is \
+assumed to be available on the default SolrServer. Remote SolrServer are NOT supported!
+
+enhancer.engines.linking.solrfst.fieldEncoding.name=Field Name Encoding
+enhancer.engines.linking.solrfst.fieldEncoding.description=Specifies how FieldNames \
+of the SolrCore are encoded. This is mainly used to specify the pattern used to \
+name fields holding entity labels of different languages. The 'SolrYard' supports \
+the encoding used by the Stanbol Entityhub SolrYard implementation. If 'None' is \
+selected the exact field names used by the SolrCore need to be configured.
+enhancer.engines.linking.solrfst.fieldEncoding.option.none=None
+enhancer.engines.linking.solrfst.fieldEncoding.option.solrYard=SolrYard
+enhancer.engines.linking.solrfst.fieldEncoding.option.minusPrefix='-' Prefix: '{lang}-{name}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.underscorePrefix='_' Prefix: '{lang}_{name}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.minusSuffix='-' Suffix: '{name}-{lang}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.underscoreSuffix='_' Suffix: '{name}_{lang}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.atPrefix='@' Prefix: '{lang}@{name}'
+enhancer.engines.linking.solrfst.fieldEncoding.option.atSuffix='@' Suffix: '{name}@{lang}'
+
+enhancer.engines.linking.solrfst.fstconfig.name=FST Corpora configuration
+enhancer.engines.linking.solrfst.fstconfig.description=Configuration for the FST \
+Corpora. Syntax: '{lang};{param-name}={param-value};{param-name}={param-value};...' \
+Supported {param-name}s: 'field' ... the field name of the SolrIndex used for the \
+FST corpus (default: rdfs:label). The configured field name is encoded using the \
+Field Name Encoding. \
+'fst' ... the {base-name} of the file with the serialized FST model (default: {field} with \
+none alpha-numeric chars replaced by '_'). The actual file name is '{base-name}.{lang}.fst'. \
+Files are located in the 'fst' folder relative to the instance directory of the \
+configured SolrCore. \
+'generate' ... Boolean switch that allows to enable runtime generation of FST \
+corpora (default: false) \
+
+enhancer.engines.linking.solrfst.typeField.name=Entity Type Field
+enhancer.engines.linking.solrfst.typeField.description=The Solr Field holding the \
+type information of Entities. Values are expected to be URIs
+
+enhancer.engines.linking.solrfst.rankingField.name=Entity Ranking Field
+enhancer.engines.linking.solrfst.rankingField.description=The Solr Field holding the \
+Entity Ranking (importance of the Entity within the knowledge base). Values \
+are expected to be floating point numbers.
+
+enhancer.engines.linking.solrfst.fstThreadPoolSize.name=FST Thread Pool Size
+enhancer.engines.linking.solrfst.fstThreadPoolSize.description=The size of the \
+thread pool used for the runtime creation of FST models. NOTE that memory allocation \
+during creation is considerably higher than for holding the built model (up to two times), \
+so creating multiple models in parallel may require a lot of heap space. If memory \
+allocation is not an issue this value should be set based on the available CPU cores \
+and the resources one would like to assign to the creation of FST models.
+
+enhancer.engines.linking.solrfst.entityCacheSize.name=Entity Cache Size
+enhancer.engines.linking.solrfst.entityCacheSize.description=Used to configure \
+the size of the cache used for Entity information. While the FST linking is \
+fully performed in memory this engine still needs to load tagging relevant fields \
+(labels, types, redirects and entity ranking) for matched entities from the disc. \
+The EntityCache is an LRU cache for such information (default is 65k entities)
+
+#===============================================================================
+#Properties and Options used to configure
+#===============================================================================
+
+enhancer.engines.linking.suggestions.name=Max Suggestions
+enhancer.engines.linking.suggestions.description=The maximum number of suggestions
+
+enhancer.engines.linking.minSearchTokenLength.name=Min Token Length
+enhancer.engines.linking.minSearchTokenLength.description=The minimum \
+length of Tokens used to lookup Entities within the Controlled Vocabulary. This parameter is ignored \
+in case a POS (Part of Speech) tagger is available for the language of the parsed content.
+
+enhancer.engines.linking.caseSensitive.name=Case Sensitivity
+enhancer.engines.linking.caseSensitive.description=Allows to enable/disable \
+case sensitive ranking. NOTE that the linking is based on the Solr FieldType of the \
+FST field. This only affects the ranking (fise:confidence value) of suggestions.
+
+enhancer.engines.linking.properNounsState.name=Link ProperNouns only
+enhancer.engines.linking.properNounsState.description=If activated \
+only ProperNouns will be matched against the Vocabulary. If deactivated any Noun will be matched. \
+NOTE that this parameter requires a tag of the POS TagSet to be mapped against 'olia:ProperNoun'. \
+Otherwise mapping will not work as expected.
+
+enhancer.engines.linking.processedLanguages.name=Processed Languages
+enhancer.engines.linking.processedLanguages.description=Languages to \
+process and optionally language specific configurations. Syntax "{lang};{param-name}={param-value};\
+{param-name}={param-value};...". Supported {param-name}s: "lc" - processed Lexical Categories (see \
+LexicalCategory enumeration for possible values); "pos" - processed Pos types (see Pos enumeration \
+for possible values); "tag" - processed string pos tags; "prob" - minimum probability of pos annotations.
+
+enhancer.engines.linking.defaultMatchingLanguage.name=Default Matching Language
+enhancer.engines.linking.defaultMatchingLanguage.description=The language \
+used in addition to the language detected for the analysed text to search for Entities. Typically this \
+configuration is an empty string to search for labels without any language defined, but for some data \
+sets (such as DBpedia.org) that add languages to any labels it might improve results to change this \
+configuration (e.g. to 'en' in the case of DBpedia.org).
+
+enhancer.engines.linking.typeMappings.name=Type Mappings
+enhancer.engines.linking.typeMappings.description=This allows to add \
+additional entity-type > text-annotation-type mappings. Such mappings are used to determine the \
+'dc:type' value of the 'fise:TextAnnotation' created for extracted entities. Usage: \
+variant (a) '{uri}' short for {uri} > {uri} or (b) '{source1};{source2};..;{sourceN} > {target}'. \
+Note that a {source} may be only mapped to a single {target}. Multiple {source} types \
+can be mapped to the same {target}.
+
+enhancer.engines.linking.typeField.name=Type Field
+enhancer.engines.linking.typeField.description=The field used to \
+retrieve the types of matched Entities. Values of that field are expected to be URIs
+
+enhancer.engines.linking.entityTypes.name=Entity Type Filter
+enhancer.engines.linking.entityTypes.description=Allows to define a white/black list \
+based on the types of Entities. Use '!{uri}' for black listing and '{uri}' for white \
+listing. Include '*' to force white listing (e.g. to allow Entities without any type). \
+Rules are processed based on their order.
+
Added: stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties?rev=1516784&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties (added)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/resources/log4j.properties Fri Aug 23 09:31:48 2013
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Root logger option
+log4j.rootLogger=INFO, stdout
+
+# Direct log messages to stdout
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
+log4j.logger.org.apache.stanbol.enhancer.engines.keywordextraction=DEBUG
\ No newline at end of file