You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/09/22 08:51:36 UTC
svn commit: r1173968 [3/5] - in /incubator/stanbol/trunk:
commons/installer/bundleprovider/src/main/java/org/apache/stanbol/commons/installer/provider/bundle/impl/
commons/jsonld/ commons/opennlp/
commons/opennlp/src/main/java/org/apache/stanbol/common...
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,676 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.engine;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.PropertyOption;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
+import org.apache.felix.scr.annotations.ReferenceStrategy;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer;
+import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinker;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity.Occurrence;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.EntityhubSearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Component(configurationFactory = true, policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
+ specVersion = "1.1", metatype = true, immediate = true)
+@Service
+@org.apache.felix.scr.annotations.Properties(value={
+ @Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
+ @Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD),
+ @Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD),
+ @Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD),
+ //@Property(name=TaxonomyLinkingEngine2.SIMPLE_TOKENIZER,boolValue=true),
+ //@Property(name=TaxonomyLinkingEngine2.ENABLE_CHUNKER,boolValue=false),
+ @Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
+ @PropertyOption(
+ value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.ignore",
+ name="IGNORE"),
+ @PropertyOption(
+ value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.addValues",
+ name="ADD_VALUES"),
+ @PropertyOption(
+ value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.follow",
+ name="FOLLOW")
+ },value="FOLLOW"),
+ @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
+ intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
+ @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
+ intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
+ @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value="")
+})
+public class KeywordLinkingEngine implements EnhancementEngine, ServiceProperties{
+
+ private final Logger log = LoggerFactory.getLogger(KeywordLinkingEngine.class);
+ /**
+ * This is used to check the content type of parsed {@link ContentItem}s for
+ * plain text
+ */
+ protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * The default value for the Execution of this Engine.
+ * This Engine creates TextAnnotations that should not be processed by other Engines.
+ * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
+ * to ensure that other engines do not get confused
+ */
+ public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10;
+
+
+ public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
+ public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
+ public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
+ public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
+ public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
+ public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
+ public static final String MAX_SUGGESTIONS = "org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
+ public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
+ public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
+// public static final String SIMPLE_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
+// public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
+ /**
+ * Default set of languages. This is an empty set indicating that texts in any
+ * language are processed.
+ */
+ public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
+ /**
+ * The languages this engine is configured to enhance. An empty List is
+ * considered as active for any language
+ */
+ private Set<String> languages = DEFAULT_LANGUAGES;
+ /**
+ * The literal representing the LangIDEngine as creator.
+ */
+ public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
+
+ private EntitySearcher entitySearcher;
+ private EntityLinkerConfig config;
+
+ /**
+ * The reference to the OpenNLP component
+ */
+ @org.apache.felix.scr.annotations.Reference
+ private OpenNLP openNLP;
+ /**
+ * Used for natural language processing of parsed content
+ */
+ private TextAnalyzer textAnalyser;
+ /**
+ * Used to create {@link AnalysedContent} instances for parsed content items
+ */
+ private OpenNlpAnalysedContentFactory analysedContentFactory;
+ /**
+ * The literalFactory used to create typed literals
+ */
+ private LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+ /**
+ * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
+ * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate
+ * offline or not.
+ *
+ * @see #enableOfflineMode(OfflineMode)
+ * @see #disableOfflineMode(OfflineMode)
+ */
+ @org.apache.felix.scr.annotations.Reference(
+ cardinality = ReferenceCardinality.OPTIONAL_UNARY,
+ policy = ReferencePolicy.DYNAMIC,
+ bind = "enableOfflineMode",
+ unbind = "disableOfflineMode",
+ strategy = ReferenceStrategy.EVENT)
+ private OfflineMode offlineMode;
+
+ /**
+ * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available
+ *
+ * @param mode
+ */
+ protected final void enableOfflineMode(OfflineMode mode) {
+ this.offlineMode = mode;
+ }
+
+ /**
+ * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable
+ *
+ * @param mode
+ */
+ protected final void disableOfflineMode(OfflineMode mode) {
+ this.offlineMode = null;
+ }
+
+ /**
+ * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
+ *
+ * @return the offline state
+ */
+ protected final boolean isOfflineMode() {
+ return offlineMode != null;
+ }
+
+ /**
+ * Default constructor as used by OSGI. This expects that
+ * {@link #activate(ComponentContext)} is called before usage
+ */
+ public KeywordLinkingEngine() {
+ }
+ /**
+ * Internal Constructor used by {@link #createInstance(OpenNLP, EntitySearcher, EntityLinkerConfig)}
+ * @param openNLP
+ * @param entitySearcher
+ * @param config
+ */
+ protected KeywordLinkingEngine(OpenNLP openNLP,EntitySearcher entitySearcher,
+ EntityLinkerConfig config){
+ this.openNLP = openNLP;
+ this.textAnalyser = new TextAnalyzer(openNLP);
+ this.analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(textAnalyser);
+ this.entitySearcher = entitySearcher;
+ this.config = config != null ? config : new EntityLinkerConfig();
+ }
+ /**
+ * Allows to create an instance that can be used outside of an OSGI
+ * environment. This is mainly intended for unit tests.
+ * @param openNLP The {@link OpenNLP} instance used for natural language processing
+ * @param entitySearcher the searcher used to lookup terms
+ * @param config the configuration or <code>null</code> to use the defaults
+ * @return the created engine instance
+ */
+ public static KeywordLinkingEngine createInstance(OpenNLP openNLP,
+ EntitySearcher entitySearcher,
+ EntityLinkerConfig config){
+ return new KeywordLinkingEngine(openNLP,entitySearcher,config);
+ }
+
+
+ /**
+ * Checks if the parsed language is enabled for processing.
+ * @param language The language to process
+ * @return the processing state for the parsed language.
+ */
+ protected boolean isProcessableLanguages(String language) {
+ return languages.isEmpty() || languages.contains(language);
+ }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(
+ ENHANCEMENT_ENGINE_ORDERING,
+ (Object) DEFAULT_ORDER));
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ String mimeType = ci.getMimeType().split(";", 2)[0];
+ if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
+ return ENHANCE_SYNCHRONOUS;
+ }
+ // check for existence of textual content in metadata
+ UriRef subj = new UriRef(ci.getId());
+ Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
+ if (it.hasNext()) {
+ return ENHANCE_SYNCHRONOUS;
+ }
+ return CANNOT_ENHANCE;
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
+ }
+ String mimeType = ci.getMimeType().split(";", 2)[0];
+ String text = extractText(ci, mimeType);
+ if (text.trim().length() == 0) {
+ // TODO: make the length of the data a field of the ContentItem
+ // interface to be able to filter out empty items in the canEnhance
+ // method
+ log.warn("nothing to extract knowledge from in ContentItem {}", ci);
+ return;
+ }
+ //Determine the language
+ String language = extractLanguage(ci);
+ if(isProcessableLanguages(language)){
+ log.debug("computeEnhancements for ContentItem {} language {} text={}",
+ new Object []{ci.getId(), language, StringUtils.abbreviate(text, 100)});
+
+ EntityLinker taxonomyLinker = new EntityLinker(
+ analysedContentFactory.create(text, language),
+ entitySearcher, config);
+ //process
+ taxonomyLinker.process();
+ //write results
+ writeEnhancements(ci, taxonomyLinker.getLinkedEntities().values(), language);
+ } else {
+ log.debug("ignore ContentItem {} because language '{}' is not configured to" +
+ "be processed by this engine.",ci.getId(),language);
+ }
+
+ }
+
+ /**
+ * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
+ * extracted from the parsed ContentItem
+ * @param ci
+ * @param linkedEntities
+ * @param language
+ */
+ private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
+ MGraph metadata = ci.getMetadata();
+ for(LinkedEntity linkedEntity : linkedEntities){
+ Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size());
+ //first create the TextAnnotations for the Occurrences
+ for(Occurrence occurrence : linkedEntity.getOccurrences()){
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ textAnnotations.add(textAnnotation);
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_START,
+ literalFactory.createTypedLiteral(occurrence.getStart())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_END,
+ literalFactory.createTypedLiteral(occurrence.getEnd())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTION_CONTEXT,
+ literalFactory.createTypedLiteral(occurrence.getContext())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTED_TEXT,
+ literalFactory.createTypedLiteral(occurrence.getSelectedText())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(linkedEntity.getScore())));
+ for(UriRef dcType : linkedEntity.getTypes()){
+ metadata.add(new TripleImpl(
+ textAnnotation, Properties.DC_TYPE, dcType));
+ }
+ }
+ //now the EntityAnnotations for the Suggestions
+ for(Suggestion suggestion : linkedEntity.getSuggestions()){
+ UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
+ //should we use the label used for the match, or search the
+ //representation for the best label ... currently its the matched one
+ Text label = suggestion.getBestLabel(config.getNameField(),language);
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_LABEL,
+ label.getLanguage() == null ?
+ new PlainLiteralImpl(label.getText()) :
+ new PlainLiteralImpl(label.getText(),
+ new Language(label.getLanguage()))));
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_REFERENCE,
+ new UriRef(suggestion.getRepresentation().getId())));
+ Iterator<Reference> suggestionTypes = suggestion.getRepresentation().getReferences(config.getTypeField());
+ while(suggestionTypes.hasNext()){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference())));
+ }
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
+ for(UriRef textAnnotation : textAnnotations){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.DC_RELATION, textAnnotation));
+ }
+ }
+ }
+ }
+ /**
+ * Extracts the language of the parsed ContentItem from the metadata
+ * @param ci the content item
+ * @return the language
+ */
+ private String extractLanguage(ContentItem ci) {
+ MGraph metadata = ci.getMetadata();
+ Iterator<Triple> langaugeEnhancementCreatorTriples =
+ metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
+ if(langaugeEnhancementCreatorTriples.hasNext()){
+ String lang = EnhancementEngineHelper.getString(metadata,
+ langaugeEnhancementCreatorTriples.next().getSubject(),
+ Properties.DC_LANGUAGE);
+ if(lang != null){
+ return lang;
+ } else {
+ log.warn("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
+ new Object[]{ci.getId(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
+ log.warn(" ... return 'en' as default");
+ return "en";
+ }
+ } else {
+ log.warn("Unable to extract language for ContentItem %s! Is the %s active?",
+ ci.getId(),LANG_ID_ENGINE_NAME.getLexicalForm());
+ log.warn(" ... return 'en' as default");
+ return "en";
+ }
+ }
+
+ /**
+ * Extracts the text from the parsed contentItem. In case the content type is
+ * plain text, it directly reads the text from the stream. In other cases it
+ * tries to read the string representation from the metadata by looking for
+ * values of the {@link org.apache.stanbol.enhancer.servicesapi.rdf.Properties#NIE_PLAINTEXTCONTENT}
+ * property.<p>
+ * TODO: This is a Workaround for the currently not implemented Adapter
+ * Pattern for the Stanbol Enhancer.
+ * @param ci
+ * @param mimeType
+ * @return
+ * @throws InvalidContentException
+ */
+ private String extractText(ContentItem ci, String mimeType) throws InvalidContentException {
+ String text;
+ if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
+ try {
+ text = IOUtils.toString(ci.getStream(),"UTF-8");
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
+ }
+ } else {
+ //TODO: change that as soon the Adapter Pattern is used for multiple
+ // mimetype support.
+ StringBuilder textBuilder = new StringBuilder();
+ Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getId()), NIE_PLAINTEXTCONTENT, null);
+ while (it.hasNext()) {
+ textBuilder.append(it.next().getObject());
+ }
+ text = textBuilder.toString();
+ }
+ return text;
+ }
+
+ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ * Methods for activate() and deactivate() the properties configureable via
+ * OSGI.
+ *
+ * NOTEs:
+ * Directly calling super.activate and super.deactivate
+ * is possible but might not be applicable in all cases.
+ * The activate**(...) and deactivate**() Methods are intended to be
+ * called by subclasses that need more control over the initialisation
+ * process.
+ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ */
+ /**
+ * Activates this Engine. Subclasses should not call this method but rather
+ * call<ul>
+ * <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)}
+ * <li> {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)} and
+ * <li> {@link #activateProcessedLanguages(Dictionary)}
+ * </ul>
+ * if applicable.
+ * @param context the Component context
+ * @throws ConfigurationException if the required {@link #REFERENCED_SITE_ID}
+ * configuration is missing or any of the other properties has an illegal value
+ */
+ @Activate
+ @SuppressWarnings("unchecked")
+ protected void activate(ComponentContext context) throws ConfigurationException {
+ textAnalyser = new TextAnalyzer(openNLP);
+ analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(textAnalyser);
+ Dictionary<String,Object> properties = context.getProperties();
+ activateEntitySearcher(context, properties);
+ activateEntityLinkerConfig(properties);
+ activateProcessedLanguages(properties);
+ }
+
+ /**
+ * Initialise the processed languages based on the value of the
+ * {@link #PROCESSED_LANGUAGES} key. If no configuration is present the
+ * default (process all languages) is used.
+ * @param configuration the OSGI component configuration
+ */
+ protected final void activateProcessedLanguages(Dictionary<String,Object> configuration) {
+ Object value;
+ value = configuration.get(PROCESSED_LANGUAGES);
+ if(value == null){
+ this.languages = DEFAULT_LANGUAGES;
+ } else if (value.toString().trim().isEmpty()){
+ this.languages = Collections.emptySet();
+ } else {
+ String[] languageArray = value.toString().split(",");
+ languages = new HashSet<String>();
+ for(String language : languageArray){
+ if(language != null){
+ language = language.trim();
+ if(!language.isEmpty()){
+ languages.add(language);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Configures the parsed {@link EntityLinkerConfig} with the values of the
+ * following properties:<ul>
+ * <li>{@link #NAME_FIELD}
+ * <li>{@link #TYPE_FIELD}
+ * <li>{@link #REDIRECT_FIELD}
+ * <li>{@link #REDIRECT_PROCESSING_MODE}
+ * <li>{@link #MAX_SUGGESTIONS}
+ * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
+ * <li>{@link #MIN_FOUND_TOKENS}
+ * </ul>
+ * This Method create an new {@link EntityLinkerConfig} instance only if
+ * <code>{@link #config} == null</code>. If the instance is already initialised
+ * that all current values for keys missing in the parsed configuration are
+ * preserved.
+ * @param configuration the configuration
+ * @throws ConfigurationException In case of an illegal value in the parsed configuration.
+ * Note that all configuration are assumed as optional, therefore missing values will not
+ * case a ConfigurationException.
+ */
+ protected void activateEntityLinkerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
+ if(config == null){
+ this.config = new EntityLinkerConfig();
+ }
+ Object value;
+ value = configuration.get(NAME_FIELD);
+ if(value != null){
+ if(value.toString().isEmpty()){
+ throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
+ }
+ config.setNameField(value.toString());
+ }
+ //init TYPE_FIELD
+ value = configuration.get(TYPE_FIELD);
+ if(value != null){
+ if(value.toString().isEmpty()){
+ throw new ConfigurationException(TYPE_FIELD,"The configured name field MUST NOT be empty");
+ }
+ config.setTypeField(value.toString());
+ }
+ //init REDIRECT_FIELD
+ value = configuration.get(REDIRECT_FIELD);
+ if(value != null){
+ if(value.toString().isEmpty()){
+ throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
+ }
+ config.setRedirectField(value.toString());
+ }
+ //init MAX_SUGGESTIONS
+ value = configuration.get(MAX_SUGGESTIONS);
+ Integer maxSuggestions;
+ if(value instanceof Integer){
+ maxSuggestions = (Integer)value;
+ } else if (value != null){
+ try {
+ maxSuggestions = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ maxSuggestions = null;
+ }
+ if(maxSuggestions != null){
+ if(maxSuggestions < 1){
+ throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0");
+ }
+ config.setMaxSuggestions(maxSuggestions);
+ }
+ //init MIN_FOUND_TOKENS
+ value = configuration.get(MIN_FOUND_TOKENS);
+ Integer minFoundTokens;
+ if(value instanceof Integer){
+ minFoundTokens = (Integer)value;
+ } else if(value != null){
+ try {
+ minFoundTokens = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ minFoundTokens = null;
+ }
+ if(minFoundTokens != null){
+ if(minFoundTokens < 1){
+ throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0");
+ }
+ config.setMinFoundTokens(minFoundTokens);
+ }
+ // init MIN_SEARCH_TOKEN_LENGTH
+ value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
+ Integer minSearchTokenLength;
+ if(value instanceof Integer){
+ minSearchTokenLength = (Integer)value;
+ } else if (value != null){
+ try {
+ minSearchTokenLength = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ minSearchTokenLength = null;
+ }
+ if(minSearchTokenLength != null){
+ if(minSearchTokenLength < 1){
+ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0");
+ }
+ config.setMaxSuggestions(minSearchTokenLength);
+ }
+ //init the REDIRECT_PROCESSING_MODE
+ value = configuration.get(REDIRECT_PROCESSING_MODE);
+ if(value != null){
+ try {
+ config.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
+ } catch (IllegalArgumentException e) {
+ throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of "+
+ Arrays.toString(RedirectProcessingMode.values()));
+ }
+ }
+ }
+
+ /**
+ * Initialise the {@link #entitySearcher} based on the value of the
+ * {@link #REFERENCED_SITE_ID} property in the parsed configuration
+ * @param context
+ * @param configuration
+ * @throws ConfigurationException
+ */
+ protected void activateEntitySearcher(ComponentContext context, Dictionary<String,Object> configuration) throws ConfigurationException {
+ Object value = configuration.get(REFERENCED_SITE_ID);
+ //init the EntitySource
+ if (value == null) {
+ throw new ConfigurationException(REFERENCED_SITE_ID,
+ "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
+ }
+ String refSiteId = value.toString();
+ if (refSiteId.isEmpty()) {
+ throw new ConfigurationException(REFERENCED_SITE_ID,
+ "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
+ }
+ if(Entityhub.ENTITYHUB_IDS.contains(refSiteId.toLowerCase())){
+ entitySearcher = new EntityhubSearcher(context.getBundleContext());
+ } else {
+ entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),refSiteId);
+ }
+ }
+ /**
+ * Deactivates this Engine. Subclasses should not call this method but rather
+ * call<ul>
+ * <li> {@link #deactivateEntitySearcher()}
+ * <li> {@link #deactivateEntityLinkerConfig()} and
+ * <li> {@link #deactivateProcessedLanguages())}
+ * </ul>
+ * @param context the context (not used)
+ */
+ @Deactivate
+ protected void deactivate(ComponentContext context) {
+ deactivateEntitySearcher();
+ deactivateProcessedLanguages();
+ deactivateEntityLinkerConfig();
+ }
+
+ /**
+ * Sets the languages to {@link #DEFAULT_LANGUAGES}
+ */
+ protected void deactivateProcessedLanguages() {
+ languages = DEFAULT_LANGUAGES;
+ }
+
+ /**
+ * sets the {@link EntityLinkerConfig} to <code>null</code>
+ */
+ protected void deactivateEntityLinkerConfig() {
+ config = null;
+ }
+
+ /**
+ * Closes and resets the EntitySearcher. Also calls
+ * {@link TrackingEntitySearcher#close()} if applicable.
+ */
+ protected void deactivateEntitySearcher() {
+ if(entitySearcher instanceof TrackingEntitySearcher<?>){
+ //close tracking EntitySearcher
+ ((TrackingEntitySearcher<?>)entitySearcher).close();
+ }
+ entitySearcher = null;
+ }
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,224 @@
+/**
+ *
+ */
+package org.apache.stanbol.enhancer.engines.keywordextraction.impl;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+
+public class ProcessingState {
+
+ private final Iterator<AnalysedText> sentences;
+ /**
+ * The sentence currently processed
+ */
+ private AnalysedText sentence;
+ /**
+ * The index of the current token needed to be linked
+ */
+ private int tokenIndex = -1;
+ /**
+ * The current token
+ */
+ private Token token;
+ /**
+ * The iterator over the chunks of the current {@link #sentence}
+ * or <code>null</code> if no {@link Chunk}s are available.
+ */
+ private Iterator<Chunk> chunks;
+ /**
+ * The current {@link Chunk}
+ */
+ private Chunk chunk;
+ /**
+ * This is a cache over the exact labels over the following 'n' tokens
+ * relative {@link #tokenIndex}. It is cleared each time {@link #next()}
+ * is called.
+ */
+ private Map<Integer,String> textCache = new HashMap<Integer,String>();
+ /**
+ * The position for the next token
+ */
+ private int nextToken = -1;
+
+ public ProcessingState(Iterator<AnalysedText> sentences){
+ this.sentences = sentences;
+ if(!sentences.hasNext()){
+ throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT have an empty AnalysedText iterator!");
+ }
+ }
+ /**
+ * Getter for the current Sentence
+ * @return the sentence
+ */
+ public final AnalysedText getSentence() {
+ return sentence;
+ }
+ /**
+ * Getter for the index of the current active token within the current
+ * active {@link #getSentence() sentence}
+ * @return the tokenPos the index of the token
+ */
+ public final int getTokenIndex() {
+ return tokenIndex;
+ }
+ /**
+ * The currently active token
+ * @return the token
+ */
+ public final Token getToken() {
+ return token;
+ }
+ /**
+ * The currently active chunk or <code>null</code> if no chunks are
+ * available. If chunks are present this can not be <code>null</code>
+ * because {@link Token}s outside of chunks are skiped.
+ * @return the chunk the current {@link Chunk} or <code>null</code> if
+ * no chunks are present.
+ */
+ public final Chunk getChunk() {
+ return chunk;
+ }
+ /**
+ * Getter for the next {@link Token} to be processed. Calling {@link #next()}
+ * is guaranteed to skip all tokens in between {@link #getTokenIndex()}
+ * and {@link #getNextToken()}, but it might even skip more tokens (e.g.
+ * in case that the token referenced by {@link #getNextToken()} is not
+ * within a {@link Chunk}
+ * @return the nextToken
+ */
+ public final int getNextToken() {
+ return nextToken;
+ }
+ /**
+ * Allows to manually set to position of the next token to process.
+ * This can be used to skip some tokens within (e.g. if a Concept
+ * matching multiple Tokens where found.<p>
+ * The set token may be greater than the number of tokens in
+ * {@link #sentence}. This will simple cause the next sentence to be
+ * activated on the next call to {@link #next()}
+ * @param pos the position of the next token to process.
+ */
+ public void setNextToken(int pos){
+ if(pos > tokenIndex){
+ this.nextToken = pos;
+ } else {
+ throw new IllegalArgumentException("The nextTokenPos "+pos+
+ " MUST BE greater than the current "+tokenIndex);
+ }
+ }
+ /**
+ * Moves the state to #nextToken this may switch to the next Chunk or
+ * sentence.
+ * @return <code>true</code> if there are further elements to process or
+ * <code>false</code> if there are no further elements to process.
+ */
+ public boolean next() {
+ //first clear caches for the current element
+ textCache.clear();
+ //switch to the next token
+ if(nextToken > tokenIndex){
+ tokenIndex = nextToken;
+ } else {
+ tokenIndex++;
+ nextToken = tokenIndex;
+ }
+ //now init the next element
+ final boolean hasNext;
+ if(chunk != null){ //if chunks are present
+ //get next chunk (may be the current if chunk.getEnd() > tokenPos
+ for(;tokenIndex > chunk.getEnd() && chunks.hasNext();chunk = chunks.next());
+ if(tokenIndex <= chunk.getEnd()){ //found valid chunk
+ if(chunk.getStart() > tokenIndex) { //skip tokens outside chunks
+ tokenIndex = chunk.getStart();
+ }
+ hasNext = true;
+ } else { //no more valid chunks in this sentence
+ hasNext = initNextSentence();
+ }
+ } else { //no chunks ... use tokens only
+ if(sentence == null){ //first sentence
+ hasNext = initNextSentence();
+ } else if(tokenIndex >= sentence.getTokens().size()){
+ hasNext = initNextSentence();
+ } else { //more tokens in the sentence
+ //set the token
+ hasNext = true;
+ }
+ }
+ if(hasNext){ //set the Token
+ token = sentence.getTokens().get(tokenIndex);
+ }
+ return hasNext;
+ }
+
+ /**
+ * Correctly initialise {@link #sentence}, {@link #chunks}, {@link #chunk}
+ * and {@link #tokenIndex} for the next element of {@link #sentences}. If
+ * no further sentences are to process it simple sets {@link #sentence},
+ * {@link #chunks}, {@link #chunk} and {@link #tokenIndex} to <code>null</code>
+ */
+ private boolean initNextSentence() {
+ sentence = null;
+ while(sentence == null && sentences.hasNext()){
+ sentence = sentences.next();
+ if(sentence.getChunks() != null){
+ chunks = sentence.getChunks().iterator();
+ if(chunks.hasNext()){
+ chunk = chunks.next();
+ tokenIndex = chunk.getStart();
+ nextToken = tokenIndex;
+ } else { //no chunks in this sentence
+ sentence = null; //skip this sentence
+ }
+ } else {
+ if(sentence.getTokens().isEmpty()){ //no tokens in this sentence
+ sentence = null; //skip this one
+ } else {
+ chunks = null;
+ chunk = null;
+ tokenIndex = 0;
+ nextToken = 0;
+ }
+ }
+ }
+ return sentence != null;
+ }
+ /**
+ * Getter for the text covered by the next tokenCount tokens relative to
+ * {@link #token}. It uses the {@link #textCache} to lookup/store such texts.
+ * Given the Tokens
+ * <pre>
+ * [This, is, an, Example]
+ * </pre>
+ * and the parameter <code>3</code> this method will return
+ * <pre>
+ * This is an
+ * </pre>
+ * @param tokenCount the number of tokens to be included relative to
+ * {@link #tokenIndex}
+ * @return the text covered by the span start of {@link #token} to end of
+ * token at <code>{@link #tokenIndex}+tokenCount</code>.
+ */
+ public String getTokenText(int tokenCount){
+ Integer pos = Integer.valueOf(tokenCount-1);
+ String text = textCache.get(Integer.valueOf(tokenCount-1));
+ if(text == null){
+ text = sentence.getText().substring(token.getStart(),
+ sentence.getTokens().get(tokenIndex+pos.intValue()).getEnd());
+ textCache.put(pos, text);
+ }
+ return text;
+ }
+ @Override
+ public String toString() {
+ return "["+tokenIndex+","+token+"] chunk: " +
+ (chunk == null?null:chunk.getText())+"| sentence: "+
+ (sentence == null?null:sentence.getText());
+ }
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,63 @@
+/**
+ *
+ */
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.Iterator;
+
+import opennlp.tools.util.Span;
+
+import org.apache.stanbol.commons.opennlp.TextAnalyzer;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+
+/**
+ * Represents content that was already analysed with NLP tools and that is to
+ * be linked with Entities of an {@link EntitySearcher}.<p>
+ * Note that for the linking process it is only required that the text is
+ * tokenized. All other features (sentence detection, POS tags and Chunks) are
+ * optional, but they do improve the performance and, to a smaller extent,
+ * also the results of the linking process. <p>
+ * TODO: <ul>
+ * <li> Find a better Name
+ * <li> The API is not optimal. In general the {@link TextAnalyzer} and the
+ * {@link AnalysedContent} interface do not play well together :(
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+public interface AnalysedContent {
+
+
+ /**
+ * Getter for the Iterator over the analysed sentences. This Method
+ * is expected to always return the same Iterator instance, because the
+ * iteration state is shared with the linking process.
+ * @return the iterator over the analysed sentences
+ */
+ public Iterator<AnalysedText> getAnalysedText();
+ /**
+ * Called to check if a {@link Token} should be used to search for
+ * Concepts within the Taxonomy based on the POS tag of the Token.
+ * @param posTag the POS tag to check
+ * @return <code>true</code> if Tokens with this POS tag should be
+ * included in searches and <code>false</code> if not. If this information
+ * is not available (e.g. no set of POS tags that need to be processed is
+ * defined) this Method MUST return <code>null</code>
+ */
+ public Boolean processPOS(String posTag);
+ /**
+ * Called to check if a chunk should be used to search for Concepts.
+ * @param chunkTag the tag (type) of the chunk
+ * @return <code>true</code> if chunks with this tag (type) should be
+ * processed (used to search for matches of concepts) and <code>false</code>
+ * if not. If this information is not available (e.g. no set of Tags that
+ * need to be processed is defined) this Method MUST return <code>null</code>
+ */
+ public Boolean processChunk(String chunkTag);
+ /**
+ * Tokenizes the parsed label
+ * @param label the label to tokenize
+ * @return the texts of the tokens (NOTE: the declared return type is
+ * <code>String[]</code> - the token texts, not their {@link Span}s)
+ */
+ public String[] tokenize(String label);
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,375 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.util.Span;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion.MATCH;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
+
+public class EntityLinker {
+
+ private final EntityLinkerConfig config;
+ private final AnalysedContent content;
+ private final EntitySearcher entitySearcher;
+ /**
+ * The state of the current processing
+ */
+ private final ProcessingState state;
+ /**
+ * The map holding the results of the linking process
+ */
+ private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+
+ /**
+ * After {@link #process()}ing this returns the entities linked for the
+ * parsed {@link AnalysedContent}.
+ * @return the linked entities
+ */
+ public final Map<String,LinkedEntity> getLinkedEntities() {
+ return linkedEntities;
+ }
+ public EntityLinker(AnalysedContent content,EntitySearcher taxonomy,EntityLinkerConfig config){
+ if(config == null){
+ throw new IllegalArgumentException("The parsed TaxonomyLinkerConfig MUST NOT be NULL!");
+ }
+ if(taxonomy == null){
+ throw new IllegalArgumentException("The parsed Taxonomy MUST NOT be NULL!");
+ }
+ if(content == null){
+ throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT be NULL!");
+ }
+ this.content = content;
+ this.entitySearcher = taxonomy;
+ this.config = config;
+ this.state = new ProcessingState(content.getAnalysedText());
+ }
+ /**
+ * Steps over the sentences, chunks, tokens of the {@link #sentences}
+ */
+ public void process(){
+ while(state.next()) {
+ if(isProcessableToken(state.getToken())){
+ List<String> searchStrings = new ArrayList<String>(config.getMaxSearchTokens());
+ searchStrings.add(state.getToken().getText());
+ //get the list of all tokens that can possible be matched
+ int includeTokenIndex = state.getTokenIndex();
+ includeTokenIndex++;
+ while(searchStrings.size() < config.getMaxSearchTokens() && //more search strings
+ (includeTokenIndex <= (state.getChunk() != null ? //still within
+ state.getChunk().getEnd() : //the chunk
+ state.getSentence().getTokens().size()-1))){ //or sentence
+ Token included = state.getSentence().getTokens().get(includeTokenIndex);
+ includeTokenIndex++;
+ if(isProcessableToken(included)){
+ searchStrings.add(included.getText());
+ }
+ }
+ //search for Entities
+ List<Suggestion> suggestions = lookupEntities(searchStrings);
+ if(!suggestions.isEmpty()){
+ //update the suggestions based on the best match
+ int bestMatchCount = suggestions.get(0).getMatchCount();
+ Iterator<Suggestion> it = suggestions.iterator();
+ while(it.hasNext()){
+ Suggestion suggestion = it.next();
+ //suggestions that match less tokens as the best match
+ //need to be updated to PARTIAL
+ if(suggestion.getMatchCount() < bestMatchCount){
+ suggestion.setMatch(MATCH.PARTIAL);
+ }
+ //Filter matches with less than config.getMinFoundTokens()
+ //if matchcount is less than of the best match
+ if(suggestion.getMatchCount() < bestMatchCount &&
+ suggestion.getMatchCount() < config.getMinFoundTokens()){
+ it.remove();
+ } else { //calculate the score
+ //how good is the current match in relation to the best one
+ double spanScore = ((double)suggestion.getMatchCount())/bestMatchCount;
+ //how good is the match to the span selected by this suggestion
+ double textScore = ((double)suggestion.getMatchCount())/suggestion.getSpan();
+ //how good is the match in relation to the tokens of the suggested label
+ double labelScore = ((double)suggestion.getMatchCount()/suggestion.getLabelTokenCount());
+ suggestion.setScore(spanScore*spanScore*textScore*labelScore);
+ }
+ }
+ Suggestion oldBestRanked = suggestions.get(0); //for debugging
+ //resort by score
+ Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ //this should never happen ... but the
+ //matchcount of the best match MUST NOT change
+ //after the sort by score!
+ if(bestMatchCount != suggestions.get(0).getMatchCount()){
+ //TODO: change this to a warning (like to have exceptions during debugging)
+ throw new IllegalStateException(String.format(
+ "The match count for the top Ranked Suggestion for %s changed after resorting based on Scores! (original: %s, currnet %s)",
+ state.getTokenText(bestMatchCount),oldBestRanked,suggestions));
+ }
+ //remove all suggestions > config.maxSuggestions
+ if(suggestions.size() > config.getMaxSuggestions()){
+ suggestions.subList(config.getMaxSuggestions(),suggestions.size()).clear();
+ }
+
+ //process redirects
+ if(config.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
+ for(Suggestion suggestion : suggestions){
+ processRedirects(suggestion);
+ }
+ }
+ int span = suggestions.get(0).getSpan();
+ //Store the linking results
+ String selectedText = state.getTokenText(span);
+ //float score;
+ LinkedEntity linkedEntity = linkedEntities.get(selectedText);
+ if(linkedEntity == null){
+ linkedEntity = new LinkedEntity(selectedText,
+ suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ linkedEntities.put(selectedText, linkedEntity);
+ }
+ linkedEntity.addOccurrence(
+ state.getSentence(), state.getTokenIndex(), span);
+ //set the next token to process to the next word after the
+ //currently found suggestion
+ state.setNextToken(state.getTokenIndex()+span);
+ }
+
+ } //else do not process this token
+ }
+ }
+ /**
+ * Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed
+ * {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
+ * values for the {@link LinkedEntity#getTypes()} by using the configured
+ * {@link EntityLinkerConfig#getTypeMappings() types mappings} (and if
+ * no mapping is found the {@link EntityLinkerConfig#getDefaultDcType()
+ * default} type.
+ * @param conceptTypes The list of suggestions
+ * @return the types values for the {@link LinkedEntity}
+ */
+ private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
+ Collection<String> conceptTypes = new HashSet<String>();
+ for(Suggestion suggestion : suggestions){
+ for(Iterator<Reference> types =
+ suggestion.getRepresentation().getReferences(config.getTypeField());
+ types.hasNext();conceptTypes.add(types.next().getReference()));
+ }
+ Map<String,UriRef> typeMappings = config.getTypeMappings();
+ Set<UriRef> dcTypes = new HashSet<UriRef>();
+ for(String conceptType : conceptTypes){
+ UriRef dcType = typeMappings.get(conceptType);
+ if(dcType != null){
+ dcTypes.add(dcType);
+ }
+ }
+ if(dcTypes.isEmpty() && config.getDefaultDcType() != null){
+ dcTypes.add(config.getDefaultDcType());
+ }
+ return dcTypes;
+ }
+ /**
+ * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
+ * the parsed suggestions based on the {@link RedirectProcessingMode}
+ * as configured in the {@link #config}.<p>
+ * The results of this method are stored within the parsed {@link Suggestion}s
+ * @param suggestion The suggestion to process.
+ */
+ private void processRedirects(Suggestion suggestion) {
+ //if mode is IGNORE -> nothing to do
+ if(config.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
+ return;
+ }
+ //in case results for queries are locally cached it might be the case
+ //that some/all of the results do already redirects processed.
+ //therefore there is a small internal state that stores this information
+ if(suggestion.isRedirectedProcessed()){
+ return; //Redirects for ResultMatch are already processed ... ignore
+ }
+ Representation result = suggestion.getResult();
+ Iterator<Reference> redirects = result.getReferences(config.getRedirectField());
+ switch (config.getRedirectProcessingMode()) {
+ case ADD_VALUES:
+ while(redirects.hasNext()){
+ Reference redirect = redirects.next();
+ if(redirect != null){
+ Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+ config.getSelectedFields());
+ if(redirectedEntity != null){
+ for(Iterator<String> fields = redirectedEntity.getFieldNames();fields.hasNext();){
+ String field = fields.next();
+ result.add(field, redirectedEntity.get(field));
+ }
+ }
+ //set that the redirects where searched for this result
+ suggestion.setRedirectProcessed(true);
+ }
+ }
+ case FOLLOW:
+ while(redirects.hasNext()){
+ Reference redirect = redirects.next();
+ if(redirect != null){
+ Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+ config.getSelectedFields());
+ if(redirectedEntity != null){
+ //copy the original result score
+ redirectedEntity.set(RdfResourceEnum.resultScore.getUri(),
+ result.get(RdfResourceEnum.resultScore.getUri()));
+ //set the redirect
+ suggestion.setRedirect(redirectedEntity);
+ }
+ }
+ }
+ default: //nothing to do
+ }
+ }
+ /**
+ * Searches for Entities in the {@link #entitySearcher} corresponding to the
+ * {@link Token#getText() words} of the current {@link #state position} in
+ * the text.
+ * @param searchStrings the list of {@link Token#getText() words} to search
+ * entities for.
+ * @return The sorted list with the suggestions.
+ * If there are no suggestions an empty list will be returned.
+ */
+ private List<Suggestion> lookupEntities(List<String> searchStrings) {
+ Collection<? extends Representation> results = entitySearcher.lookup(
+ config.getNameField(),config.getSelectedFields(),
+ searchStrings, state.getSentence().getLanguage());
+ List<Suggestion> suggestions = new ArrayList<Suggestion>();
+ for(Representation result : results){
+ Suggestion match = matchLabels(result);
+ if(match.getMatch() != MATCH.NONE){
+ suggestions.add(match);
+ }
+ }
+ //sort the suggestions
+ if(suggestions.size()>1){
+ Collections.sort(suggestions,Suggestion.DEFAULT_SUGGESTION_COMPARATOR);
+ }
+ //remove all elements > config.getMaxSuggestions()
+ return suggestions;
+ }
+ /**
+ * Matches the labels of the parsed {@link Representation} with the Tokens of
+ * the texts (beginning with the currently active
+ * {@link ProcessingState#getToken() token}).<p>
+ * The field used to get the labels is retrieved from
+ * {@link EntitySearcher#getNameField()}. Only labels with no language or the
+ * language of the current sentence are considered. If less than
+ * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with an
+ * label the Concept is only considered to match if the label is
+ * {@link String#equalsIgnoreCase(String)} to the text covered by the
+ * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
+ * results are allowed.
+ * @param rep The representation including at least the data for the
+ * {@link EntitySearcher#getNameField()} property.
+ * @return The result of the matching.
+ */
+ private Suggestion matchLabels(Representation rep) {
+ Iterator<Text> labels = rep.getText(config.getNameField());
+ Suggestion match = new Suggestion(rep);
+ while(labels.hasNext()){
+ Text label = labels.next();
+ //NOTE: I use here startWith language because I want 'en-GB' labels accepted for 'en'
+ if(label.getLanguage() == null || label.getLanguage().startsWith(
+ state.getSentence().getLanguage())){
+ String text = label.getText().toLowerCase();
+ List<String> labelTokens = Arrays.asList(content.tokenize(text));
+ int foundTokens = 0;
+ //ensure the correct order of the tokens in the suggested entity
+ int foundInLabelIndex = 0;
+ boolean search = true;
+ int lastFoundIndex = -1;
+ Token currentToken;
+ int maxNotFound = 1; //TODO make configureable
+ int notFound = 0;
+ for(int currentIndex = state.getTokenIndex();currentIndex < state.getSentence().getTokens().size() && search;currentIndex++){
+ currentToken = state.getSentence().getTokens().get(currentIndex);
+ boolean isProcessable = isProcessableToken(currentToken);
+ int found = text.indexOf(currentToken.getText().toLowerCase());
+ if(found>=foundInLabelIndex){ //found
+ if(isProcessable){
+ foundTokens++; //only count processable Tokens
+ }
+ foundInLabelIndex = found+currentToken.getText().length();
+ lastFoundIndex = currentIndex;
+ } else { //not found
+ notFound++;
+ if(isProcessable || notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of tokens
+ //that are not processable are not found
+ search = false;
+ }
+ } //else it is OK if non processable tokens are not found
+ }
+ MATCH labelMatch;
+ int coveredTokens = lastFoundIndex-state.getTokenIndex()+1;
+ //Matching rules
+ // - if less than config#minTokenFound() than accept only EXACT
+ // - override PARTIAL matches with FULL/EXACT matches only if
+ // foundTokens of the PARTIAL match is > than of the FULL/EXACT
+ // match (this will be very rare
+ if(foundTokens > 0 && match.getMatchCount() <= foundTokens) {
+ String currentText = state.getTokenText(coveredTokens);
+ if(currentText.equalsIgnoreCase(label.getText())){
+ labelMatch = MATCH.EXACT;
+ //set found to covered: May be lower because only
+ //processable tokens are counted, but Exact also checks
+ //of non-processable!
+ foundTokens = coveredTokens;
+ } else if(foundTokens >= config.getMinFoundTokens()){
+ if(foundTokens == coveredTokens){
+ labelMatch = MATCH.FULL;
+ } else {
+ labelMatch = MATCH.PARTIAL;
+ }
+ } else {
+ labelMatch = MATCH.NONE;
+ }
+ if(labelMatch != MATCH.NONE){
+ if(match.getMatchCount() < foundTokens ||
+ match.getMatchCount() < foundTokens &&
+ labelMatch.ordinal() > match.getMatch().ordinal()){
+ match.updateMatch(labelMatch, coveredTokens, foundTokens,label,labelTokens.size());
+ } //else this match is not better as the existing one
+ } //else ignore labels with MATCH.NONE
+ } //else NO tokens found -> nothing to do
+ } // else worng language
+ }
+ return match;
+ }
+
+ /**
+ * Checks if the current token of {@link #state} is processable.
+ * @param token the {@link Token} to check.
+ * @return <code>true</code> if the parsed token needs to be processed.
+ * Otherwise <code>false</code>
+ */
+ private boolean isProcessableToken(Token token) {
+ Boolean processToken = null;
+ if(token.getPosTag() != null){
+ processToken = content.processPOS(token.getPosTag());
+ }
+ if(processToken == null) {
+ processToken = token.getText().length() >= config.getMinSearchTokenLength();
+ }
+ return processToken;
+ }
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,399 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.chunker.Chunker;
+import opennlp.tools.postag.POSTagger;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion.MATCH;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+
+/**
+ * The configuration for the {@link EntityLinker}. Typically this
+ * configuration does not change often. Therefore it will be used for
+ * several {@link EntityLinker} instances processing different
+ * contents.
+ * @author Rupert Westenthaler
+ *
+ */
+public class EntityLinkerConfig {
+ /**
+ * The minimum length of Token to be used for searches in case no
+ * POS (Part of Speech) tags are available.
+ */
+ public static final int DEFAULT_MIN_SEARCH_TOKEN_LENGTH = 3;
+ /**
+ * The default number for the maximum number of terms suggested for a word
+ */
+ public static final int DEFAULT_SUGGESTIONS = 3;
+ /**
+ * Default value for the number of tokens that must be contained in
+ * suggested terms.
+ */
+ public static final int DEFAULT_MIN_FOUND_TOKENS = 2;
+ /**
+ * Multiple Tokens can be sent to the {@link EntitySearcher} service. The
+ * service uses this as optional parameters for the search. Therefore
+ * returned Concepts MUST contain at least a single of the parsed
+ * tokens. <p>
+ * The default value of <code>2</code> should be enough for nearly all
+ * Taxonomies to sufficiently reduce the number of results.<p>
+ * NOTE that the labels (nameField) of the results are compared as a
+ * whole. So even if only 2 Tokens are used for the search there may be
+ * more mapped to the actual label of an result.
+ */
+ public static final int DEFAULT_MAX_SEARCH_TOKENS = 2;
+
+ /**
+ * Default value for {@link #getNameField()} (rdfs:label)
+ */
+ public static final String DEFAULT_NAME_FIELD = "rdfs:label";
+ /**
+ * Default value for {@link #getTypeField()} (rdf:type)
+ */
+ public static final String DEFAULT_TYPE_FIELD = "rdf:type";
+ /**
+ * Default value for {@link #getRedirectField()} (rdf:seeAlso)
+ */
+ public static final String DEFAULT_REDIRECT_FIELD = "rdfs:seeAlso";
+ /**
+ * Default mapping for Concept types to dc:type values added for
+ * TextAnnotations.
+ */
+ public static final Map<String,UriRef> DEFAULT_ENTITY_TYPE_MAPPINGS;
+
+ static { //the default mappings for the three types used by the Stanbol Enhancement Structure
+ Map<String,UriRef> mappings = new HashMap<String,UriRef>();
+ mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(), OntologicalClasses.DBPEDIA_ORGANISATION);
+ mappings.put(NamespaceEnum.dbpediaOnt+"Newspaper", OntologicalClasses.DBPEDIA_ORGANISATION);
+ mappings.put(NamespaceEnum.schema+"Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+
+ mappings.put(OntologicalClasses.DBPEDIA_PERSON.getUnicodeString(), OntologicalClasses.DBPEDIA_PERSON);
+ mappings.put(NamespaceEnum.foaf+"Person", OntologicalClasses.DBPEDIA_PERSON);
+ mappings.put(NamespaceEnum.schema+"Person", OntologicalClasses.DBPEDIA_PERSON);
+
+ mappings.put(OntologicalClasses.DBPEDIA_PLACE.getUnicodeString(), OntologicalClasses.DBPEDIA_PLACE);
+ mappings.put(NamespaceEnum.schema+"Place", OntologicalClasses.DBPEDIA_PLACE);
+
+ mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(), OntologicalClasses.SKOS_CONCEPT);
+ DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
+ }
+ /**
+ * Enumeration over the different possibilities on how to deal with
+ * redirects (similar to Browsers following HTTP status 303 and RDF defining
+ * the "rdf:seeAlso" relation.
+ * @author Rupert Westenthaler
+ */
+ public static enum RedirectProcessingMode {
+ /**
+ * Ignore redirects
+ */
+ IGNORE,
+ /**
+ * Follow redirects, but only add the values (e.g. labels, types) such
+ * entities to the original one.
+ */
+ ADD_VALUES,
+ /**
+ * Follow the redirect.
+ */
+ FOLLOW
+ }
+ /**
+ * The default value for how to process redirect is set to
+ * {@link RedirectProcessingMode#IGNORE}
+ */
+ public static RedirectProcessingMode DEFAULT_REDIRECT_PROCESSING_MODE =
+ RedirectProcessingMode.IGNORE;
+ /**
+ * The minimum length of labels that are looked-up in the directory
+ */
+ private int minSearchTokenLength = DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
+ /**
+ * The the maximum number of terms suggested for a word
+ */
+ private int maxSuggestions = DEFAULT_SUGGESTIONS;
+ /**
+ * If several words are selected from the text to search for an Entity in the
+ * Dictionary (e.g. if a {@link Chunker} is used or if the {@link POSTagger}
+ * detects several connected nouns) that entities found for the such chunks
+ * MUST define a label (with no or the correct lanugage) that contains at
+ * least this number of tokens to be accepted.<p>
+ * TODO: make configurable
+ */
+ private int minFoundTokens = DEFAULT_MIN_FOUND_TOKENS;
+ /**
+ * The maximum numbers of Tokens sent to the {@link EntitySearcher} to search
+ * for concepts. <p>
+ * NOTE that the labels (nameField) of the results are compared as a
+ * whole. So even if only e.g. 2 tokens are used for the search there may be
+ * more mapped to the actual label of an result.
+ */
+ private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS;
+ /**
+ * Holds the mappings of rdf:type used by concepts to dc:type values used
+ * by TextAnnotations.
+ */
+ private Map<String,UriRef> typeMappings;
+ private Map<String, UriRef> unmodTypeMappings;
+ /**
+ * The mode on how to process redirect for Entities.
+ */
+ private RedirectProcessingMode redirectProcessingMode;
+ /**
+ * the default DC Type
+ */
+ private UriRef defaultDcType;
+ private String nameField;
+ private String redirectField;
+ private String typeField;
+ private Set<String> selectedFields = new HashSet<String>();
+ /**
+ * Default constructor the initialises the configuration with the
+ * default values
+ */
+ public EntityLinkerConfig(){
+ setMinSearchTokenLength(DEFAULT_MIN_SEARCH_TOKEN_LENGTH);
+ setMaxSuggestions(DEFAULT_SUGGESTIONS);
+ setMaxSearchTokens(DEFAULT_MAX_SEARCH_TOKENS);
+ setRedirectProcessingMode(DEFAULT_REDIRECT_PROCESSING_MODE);
+ typeMappings = new HashMap<String,UriRef>(DEFAULT_ENTITY_TYPE_MAPPINGS);
+ unmodTypeMappings = Collections.unmodifiableMap(typeMappings);
+ setDefaultDcType(typeMappings.remove(null));
+ setNameField(DEFAULT_NAME_FIELD);
+ setRedirectField(DEFAULT_REDIRECT_FIELD);
+ setTypeField(DEFAULT_TYPE_FIELD);
+ }
+ /**
+ * Getter for the uri of the field used for the names in the taxonomy
+ * (e.g. rdfs:label, skos:prefLabel). Needs to return the full URI
+ * @return the field used for the names of in the Taxonomy.
+ */
+ public final String getNameField() {
+ return nameField;
+ }
+ /**
+ * Setter for the uri of the field used for the names in the taxonomy
+ * (e.g. rdfs:label, skos:prefLabel).
+ * Converts short to full URIy by using the prefixes as registered in the
+ * {@link NamespaceEnum}.
+ * @param nameField the nameField to set
+ */
+ public final void setNameField(String nameField) {
+ this.nameField = NamespaceEnum.getFullName(nameField);
+ updateSelectedFields();
+ }
+ /**
+ * internally used to update the selected fields on changes to
+ * {@link #setNameField(String)}, {@link #setRedirectField(String)} or
+ * {@link #setTypeField(String)}
+ */
+ private void updateSelectedFields() {
+ selectedFields.clear();
+ selectedFields.add(nameField);
+ selectedFields.add(redirectField);
+ selectedFields.add(typeField);
+ }
+ /**
+ * Getter for the selected fields. A set that includes the current
+ * {@link #getNameField()}, {@link #getTypeField()} and {@link #getRedirectField()}.
+ * @return the selectedFields
+ */
+ public final Set<String> getSelectedFields() {
+ return selectedFields;
+ }
+ /**
+ * The field used to follow redirects (typically rdf:seeAlso)
+ * @return the redirect field
+ */
+ public final String getRedirectField() {
+ return redirectField;
+ }
+ /**
+ * The field used to follow redirects (typically rdf:seeAlso)
+ * Converts short to full URIy by using the prefixes as registered in the
+ * {@link NamespaceEnum}.
+ * @param redirectField the redirectField to set
+ */
+ public final void setRedirectField(String redirectField) {
+ this.redirectField = NamespaceEnum.getFullName(redirectField);
+ updateSelectedFields();
+ }
+ /**
+ * The field used to lookup the types (typically rdf:type)
+ * @return the field name used to lookup types
+ */
+ public final String getTypeField() {
+ return typeField;
+ }
+ /**
+ * The field used to lookup the types (typically rdf:type)
+ * Converts short to full URIy by using the prefixes as registered in the
+ * {@link NamespaceEnum}.
+ * @param typeField the typeField to set
+ */
+ public final void setTypeField(String typeField) {
+ this.typeField = NamespaceEnum.getFullName(typeField);
+ updateSelectedFields();
+ }
+ /**
+ * The minimum number of character a {@link Token} (word) must have to be
+ * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
+ * in the taxonomy. Note that this parameter is only used of no POS (Part-
+ * of-speech) tags are available in the {@link AnalysedText}.
+ * @param minSearchTokenLength the minSearchTokenLength to set
+ */
+ public void setMinSearchTokenLength(int minSearchTokenLength) {
+ this.minSearchTokenLength = minSearchTokenLength;
+ }
+ /**
+ * The minimum number of character a {@link Token} (word) must have to be
+ * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
+ * in the taxonomy. Note that this parameter is only used of no POS (Part-
+ * of-speech) tags are available in the {@link AnalysedText}.
+ * @return the minSearchTokenLength
+ */
+ public int getMinSearchTokenLength() {
+ return minSearchTokenLength;
+ }
+ /**
+ * Setter for the maximum number of suggestion returned.
+ * @param maxSuggestions the maxSuggestions to set
+ */
+ public void setMaxSuggestions(int maxSuggestions) {
+ this.maxSuggestions = maxSuggestions;
+ }
+ /**
+ * Getter for the maximum number of suggestion returned.
+ * @return the maxSuggestions
+ */
+ public int getMaxSuggestions() {
+ return maxSuggestions;
+ }
+ /**
+ * Setter for the minimum number of Tokens (of the content) that MUST match
+ * with a {@link EntitySearcher#getNameField() label} of a
+ * {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy}
+ * so that it is {@link Suggestion suggested} even if the match is only
+ * {@link MATCH#PARTIAL}. Entities that match less than that are only included
+ * if a label is an {@link MATCH#EXACT EXACT} match with the current position
+ * in the text.
+ * @param minFoundTokens the minFoundTokens to set
+ */
+ public void setMinFoundTokens(int minFoundTokens) {
+ this.minFoundTokens = minFoundTokens;
+ }
+ /**
+ * Getter for the minimum number of Tokens (of the content) that MUST match
+ * with a {@link EntitySearcher#getNameField() label} of a
+ * {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy}
+ * so that it is {@link Suggestion suggested} even if the match is only
+ * {@link MATCH#PARTIAL}. Entities that match less than that are only included
+ * if a label is an {@link MATCH#EXACT EXACT} match with the current position
+ * in the text.
+ * @return the minFoundTokens
+ */
+ public int getMinFoundTokens() {
+ return minFoundTokens;
+ }
+ /**
+ * Getter for the maximum number of tokens parsed to
+ * {@link EntitySearcher#lookup(java.util.List, String...)}
+ * @return the maxSearchTokens
+ */
+ public final int getMaxSearchTokens() {
+ return maxSearchTokens;
+ }
+ /**
+ * The maximum number of tokens parsed to
+ * {@link EntitySearcher#lookup(java.util.List, String...)}. This is NOT the
+ * maximum number of Tokens mapped for Entities returned by such queries.<p>
+ * In case {@link Chunk}s are available in the parsed {@link AnalysedText}
+ * searches can be scoped by such chunks. However if no chunks are available,
+ * than this value is used to collect this number of words in the text.<p>
+ * The {@link #DEFAULT_MAX_SEARCH_TOKENS default value} of <code>2</code>
+ * should be ok in most cases.
+ * @param maxSearchTokens the maxSearchTokens to set
+ */
+ public final void setMaxSearchTokens(int maxSearchTokens) {
+ this.maxSearchTokens = maxSearchTokens;
+ }
+ /**
+ * Removes the mapping for the parsed concept type
+ * @param conceptType the concept type to remove the mapping
+ * @return the previously mapped dc:type value or <code>null</code> if
+ * no mapping for the parsed concept type was present
+ */
+ public UriRef removeTypeMapping(String conceptType){
+ return typeMappings.remove(conceptType);
+ }
+ /**
+ *
+ * @param conceptType the type of the concept or <code>null</code> to
+ * add the default dc:type mapping. See also {@link #setDefaultDcType(UriRef)}
+ * @param dcType the dc:type for the parsed concept type
+ * @return the previously mapped dc:type value if an existing mapping
+ * was updated or <code>null</code> if a new mapping was added.
+ */
+ public UriRef setTypeMapping(String conceptType, UriRef dcType){
+ if(dcType == null) {
+ throw new IllegalArgumentException("The parsed dc:type URI MUST NOT be NULL!");
+ }
+ if(conceptType == null){ //handle setting of the default dc:type value
+ UriRef oldDefault = getDefaultDcType();
+ setDefaultDcType(dcType);
+ return oldDefault;
+ }
+ return typeMappings.put(conceptType, dcType);
+ }
+
+ /**
+ * Setter for the default dc:type of linked entities if for none of the
+ * types of the suggestions a {@link #getTypeMappings()} exists. Set this
+ * to <code>null</code> to specify that no dc:type should be set in such
+ * cases.
+ * @param defaultDcType the defaultDcType to set
+ */
+ public void setDefaultDcType(UriRef defaultDcType) {
+ this.defaultDcType = defaultDcType;
+ }
+ /**
+ * The default type for Entities if no {@link #getTypeMappings() type mapping}
+ * is present. <code>null</code> means that no type should be set if no
+ * explicit mapping exists
+ * @return the defaultDcType
+ */
+ public UriRef getDefaultDcType() {
+ return defaultDcType;
+ }
+ /**
+ * Setter for the mode on how to deal with redirects
+ * @param redirectProcessingMode the redirectProcessingMode to set
+ */
+ public void setRedirectProcessingMode(RedirectProcessingMode redirectProcessingMode) {
+ this.redirectProcessingMode = redirectProcessingMode;
+ }
+ /**
+ * Getter for the mode how to deal with redirects
+ * @return the redirectProcessingMode
+ */
+ public RedirectProcessingMode getRedirectProcessingMode() {
+ return redirectProcessingMode;
+ }
+ /**
+ * Getter for the read only mappings of type mappings
+ * @return the type mappings (read only)
+ */
+ public Map<String,UriRef> getTypeMappings() {
+ return unmodTypeMappings;
+ }
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,51 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
+import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
+
/**
 * Interface used to search for Entities (e.g. as defined by a Controlled
 * Vocabulary). Different implementations of this interface allow to use
 * different sources. Typically the {@link Entityhub} or a {@link ReferencedSite}
 * will be used as source, but in some cases one might also use an in-memory
 * implementation.
 * @author Rupert Westenthaler
 */
public interface EntitySearcher {
    /**
     * Lookup Concepts for the parsed strings. Parameters follow the same
     * rules as {@link TextConstraint#TextConstraint(List, String...)}
     * @param field the field used to search for values in the parsed languages
     * @param includeFields A set of fields that need to be included within the
     * returned {@link Representation}. The parsed field needs also to be included
     * even if missing in this set. If <code>null</code> only the field needs
     * to be included. Other fields MAY also be included.
     * @param search the tokens to search for. MUST NOT be <code>null</code>
     * @param languages the languages to include in the search
     * @return the Representations found for the specified query
     * @throws IllegalStateException if the lookup cannot be performed (e.g.
     * because the searcher only {@link #supportsOfflineMode() supports offline
     * mode} while the underlying service requires remote access)
     */
    Collection<? extends Representation> lookup(String field,Set<String> includeFields,List<String> search,String...languages) throws IllegalStateException;
    /**
     * Lookup a concept of the taxonomy by the id.
     * @param id the id
     * @param includeFields A set of fields that need to be included within the
     * returned {@link Representation}. Other fields MAY be also included.
     * @return the concept or <code>null</code> if not found
     * @throws IllegalStateException if the entity cannot be dereferenced
     * (e.g. because remote access is required but not available)
     */
    Representation get(String id,Set<String> includeFields) throws IllegalStateException;
    /**
     * Returns <code>true</code> if this EntitySearcher can operate without
     * dependencies to remote services. This is important because Stanbol can
     * be forced to run in offline-mode.
     * @return the state
     */
    boolean supportsOfflineMode();
}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
------------------------------------------------------------------------------
svn:mime-type = text/plain