You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by fc...@apache.org on 2012/03/16 13:26:28 UTC
svn commit: r1301460 - in /incubator/stanbol/branches/0.9.0-incubating: ./
commons/opennlp/
commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/
enhancer/engines/
enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer...
Author: fchrist
Date: Fri Mar 16 12:26:27 2012
New Revision: 1301460
URL: http://svn.apache.org/viewvc?rev=1301460&view=rev
Log:
Merging latest changes from trunk to the release branch
Added:
incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/README.md
- copied unchanged from r1301458, incubator/stanbol/trunk/commons/opennlp/README.md
incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java
- copied unchanged from r1301458, incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java
Modified:
incubator/stanbol/branches/0.9.0-incubating/ (props changed)
incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml
incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl
incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java
incubator/stanbol/branches/0.9.0-incubating/reasoners/ (props changed)
Propchange: incubator/stanbol/branches/0.9.0-incubating/
------------------------------------------------------------------------------
svn:mergeinfo = /incubator/stanbol/trunk:1301064-1301458
Modified: incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java Fri Mar 16 12:26:27 2012
@@ -52,6 +52,7 @@ public class TextAnalyzer {
public static final class TextAnalyzerConfig {
protected boolean forceSimpleTokenizer = false; //default to false
+ protected boolean forceKeywordTokenizer = false; //default to false
protected boolean enablePosTagger = true;
protected boolean enableChunker = true;
protected boolean enableSentenceDetector = true;
@@ -67,6 +68,19 @@ public class TextAnalyzer {
public final void forceSimpleTokenizer(boolean useSimpleTokenizer) {
this.forceSimpleTokenizer = useSimpleTokenizer;
+ if(useSimpleTokenizer){
+ this.forceKeywordTokenizer = false;
+ }
+ }
+ public final boolean isKeywordTokenizerForced() {
+ return forceKeywordTokenizer;
+ }
+
+ public final void forceKeywordTokenizer(boolean useKeywordTokenizer) {
+ this.forceKeywordTokenizer = useKeywordTokenizer;
+ if(useKeywordTokenizer){
+ this.forceSimpleTokenizer = false;
+ }
}
public final boolean isPosTaggerEnable() {
@@ -237,6 +251,8 @@ public class TextAnalyzer {
if(tokenizer == null){
if(config.forceSimpleTokenizer){
tokenizer = SimpleTokenizer.INSTANCE;
+ } else if(config.forceKeywordTokenizer){
+ tokenizer = KeywordTokenizer.INSTANCE;
} else {
tokenizer = openNLP.getTokenizer(language);
if(tokenizer == null){
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Fri Mar 16 12:26:27 2012
@@ -19,6 +19,8 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -77,6 +79,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
@@ -98,6 +101,7 @@ import org.slf4j.LoggerFactory;
@Property(name=EnhancementEngine.PROPERTY_NAME),
@Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
@Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD),
+ @Property(name=KeywordLinkingEngine.CASE_SENSITIVE,boolValue=EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD),
@Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD),
@Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
@@ -113,10 +117,12 @@ import org.slf4j.LoggerFactory;
},value="IGNORE"),
@Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
+ @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
@Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
@Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
@Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""),
+ @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000),
@Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES,
boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE),
@Property(name=Constants.SERVICE_RANKING,intValue=0)
@@ -147,6 +153,7 @@ public class KeywordLinkingEngine
public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
+ public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive";
public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
@@ -155,7 +162,8 @@ public class KeywordLinkingEngine
public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
-// public static final String SIMPLE_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
+ public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
+ public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
// public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
/**
* Adds the dereference feature (STANBOL-333) also to this engine.
@@ -590,6 +598,13 @@ public class KeywordLinkingEngine
"The configured min POS tag probability MUST BE in the range [0..1] " +
"or < 0 to deactivate this feature (parsed value "+value+")!");
}
+ value = configuration.get(KEYWORD_TOKENIZER);
+ //the keyword tokenizer config
+ if(value instanceof Boolean){
+ nlpConfig.forceKeywordTokenizer((Boolean)value);
+ } else if(value != null && !value.toString().isEmpty()){
+ nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
+ }
nlpConfig.setMinPosTagProbability(minPosTagProb);
analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
}
@@ -626,6 +641,13 @@ public class KeywordLinkingEngine
}
linkerConfig.setNameField(value.toString());
}
+ //init case sensitivity
+ value = configuration.get(CASE_SENSITIVE);
+ if(value instanceof Boolean){
+ linkerConfig.setCaseSensitiveMatchingState((Boolean)value);
+ } else if(value != null && !value.toString().isEmpty()){
+ linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
+ } //if NULL or empty use default
//init TYPE_FIELD
value = configuration.get(TYPE_FIELD);
if(value != null){
@@ -725,6 +747,66 @@ public class KeywordLinkingEngine
linkerConfig.setDefaultLanguage(defaultLang);
}
}
+ //init type mappings
+ value = configuration.get(TYPE_MAPPINGS);
+ if(value instanceof String[]){ //support array
+ value = Arrays.asList((String[])value);
+ } else if(value instanceof String) { //single value
+ value = Collections.singleton(value);
+ }
+ if(value instanceof Collection<?>){ //and collection
+ log.info("Init Type Mappings");
+ configs :
+ for(Object o : (Iterable<?>)value){
+ if(o != null){
+ StringBuilder usage = new StringBuilder("useages: ");
+ usage.append("a: '{uri}' short for {uri} > {uri} | ");
+ usage.append("b: '{source1};{source2};..;{sourceN} > {target}'");
+ String[] config = o.toString().split(">");
+ if(config[0].isEmpty()){
+ log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config",
+ o,usage);
+ continue configs;
+ }
+ String[] sourceTypes = config[0].split(";");
+ if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){
+ log.warn("Invalid Type Mapping Config '{}': Missing Target Type '{}' ({}) -> ignore this config",
+ o,usage);
+ continue configs;
+ }
+ String targetType = config.length < 2 ? sourceTypes[0] : config[1];
+ targetType = getFullName(targetType.trim()); //support for ns:localName
+ try { //validate
+ new URI(targetType);
+ } catch (URISyntaxException e) {
+ log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config",
+ sourceTypes[0],o);
+ continue configs;
+ }
+ UriRef targetUri = new UriRef(targetType);
+ for(String sourceType : sourceTypes){
+ if(!sourceType.isEmpty()){
+ sourceType = getFullName(sourceType.trim()); //support for ns:localName
+ try { //validate
+ new URI(sourceType);
+ UriRef old = linkerConfig.setTypeMapping(sourceType, targetUri);
+ if(old == null){
+ log.info(" > add type mapping {} > {}", sourceType,targetType);
+ } else {
+ log.info(" > set type mapping {} > {} (old: {})",
+ new Object[]{sourceType,targetType,old.getUnicodeString()});
+ }
+ } catch (URISyntaxException e) {
+ log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type",
+ sourceTypes[0],o);
+ }
+ }
+ }
+ }
+ }
+ } else {
+ log.debug("No Type mappings configured");
+ }
}
/**
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Fri Mar 16 12:26:27 2012
@@ -353,7 +353,10 @@ public class EntityLinker {
* @param label
*/
private void matchLabel(Suggestion match, Text label) {
- String text = label.getText().toLowerCase();
+ String text = label.getText();
+ if(!config.isCaseSensitiveMatching()){
+ text = text.toLowerCase(); //TODO use language of label for Locale
+ }
//Tokenize the label and remove remove tokens without alpha numerical chars
String[] unprocessedLabelTokens = content.tokenize(text);
int offset = 0;
@@ -396,7 +399,10 @@ public class EntityLinker {
&& search ;currentIndex++){
currentToken = state.getSentence().getTokens().get(currentIndex);
if(currentToken.hasAplhaNumericChar()){
- currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenText = currentToken.getText();
+ if(!config.isCaseSensitiveMatching()){
+ currentTokenText = currentTokenText.toLowerCase();
+ }
currentTokenLength = currentTokenText.length();
boolean isProcessable = isProcessableToken(currentToken);
boolean found = false;
@@ -460,7 +466,10 @@ public class EntityLinker {
String labelTokenText = labelTokens[labelIndex];
if(labelTokenSet.remove(labelTokenText)){ //still not matched
currentToken = state.getSentence().getTokens().get(currentIndex);
- currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenText = currentToken.getText();
+ if(!config.isCaseSensitiveMatching()){
+ currentTokenText = currentTokenText.toLowerCase();
+ }
currentTokenLength = currentTokenText.length();
boolean found = false;
float matchFactor = 0f;
@@ -503,7 +512,7 @@ public class EntityLinker {
// match (this will be very rare
if(foundProcessableTokens > 0 && match.getMatchCount() <= foundProcessableTokens) {
String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
- if(currentText.equalsIgnoreCase(text)){
+ if(config.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
labelMatch = MATCH.EXACT;
//set found to covered: May be lower because only
//processable tokens are counted, but Exact also checks
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java Fri Mar 16 12:26:27 2012
@@ -88,6 +88,10 @@ public class EntityLinkerConfig {
*/
public static final String DEFAULT_LANGUAGE = null;
/**
+ * The default for case sensitive matching is set to <code>false</code>
+ */
+ public static final boolean DEFAULT_CASE_SENSITIVE_MATCHING_STATE = false;
+ /**
* Default mapping for Concept types to dc:type values added for
* TextAnnotations.
*/
@@ -98,6 +102,7 @@ public class EntityLinkerConfig {
mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(), OntologicalClasses.DBPEDIA_ORGANISATION);
mappings.put(NamespaceEnum.dbpediaOnt+"Newspaper", OntologicalClasses.DBPEDIA_ORGANISATION);
mappings.put(NamespaceEnum.schema+"Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+// mappings.put(NamespaceEnum.dailymed+"organization",OntologicalClasses.DBPEDIA_ORGANISATION);
mappings.put(OntologicalClasses.DBPEDIA_PERSON.getUnicodeString(), OntologicalClasses.DBPEDIA_PERSON);
mappings.put(NamespaceEnum.foaf+"Person", OntologicalClasses.DBPEDIA_PERSON);
@@ -108,6 +113,25 @@ public class EntityLinkerConfig {
mappings.put(NamespaceEnum.gml+"_Feature", OntologicalClasses.DBPEDIA_PLACE);
mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(), OntologicalClasses.SKOS_CONCEPT);
+
+// UriRef DRUG = new UriRef(NamespaceEnum.drugbank+"drugs");
+// mappings.put(DRUG.getUnicodeString(), DRUG);
+// mappings.put(NamespaceEnum.dbpediaOnt+"Drug", DRUG);
+// mappings.put(NamespaceEnum.dailymed+"drugs", DRUG);
+// mappings.put(NamespaceEnum.sider+"drugs", DRUG);
+// mappings.put(NamespaceEnum.tcm+"Medicine", DRUG);
+//
+// UriRef DISEASE = new UriRef(NamespaceEnum.diseasome+"diseases");
+// mappings.put(DISEASE.getUnicodeString(), DISEASE);
+// mappings.put(NamespaceEnum.linkedct+"condition", DISEASE);
+// mappings.put(NamespaceEnum.tcm+"Disease", DISEASE);
+//
+// UriRef SIDE_EFFECT = new UriRef(NamespaceEnum.sider+"side_effects");
+// mappings.put(SIDE_EFFECT.getUnicodeString(), SIDE_EFFECT);
+//
+// UriRef INGREDIENT = new UriRef(NamespaceEnum.dailymed+"ingredients");
+// mappings.put(INGREDIENT.getUnicodeString(), INGREDIENT);
+
DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
}
/**
@@ -162,6 +186,8 @@ public class EntityLinkerConfig {
* more mapped to the actual label of an result.
*/
private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS;
+
+ private boolean caseSensitiveMatchingState = DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
/**
* Holds the mappings of rdf:type used by concepts to dc:type values used
* by TextAnnotations.
@@ -356,6 +382,20 @@ public class EntityLinkerConfig {
this.maxSearchTokens = maxSearchTokens;
}
/**
+ * Getter for the case sensitive matching state
+ * @return the state
+ */
+ public boolean isCaseSensitiveMatching() {
+ return caseSensitiveMatchingState;
+ }
+ /**
+ * Setter for the case sensitive matching state
+ * @param state the case sensitive matching state
+ */
+ public void setCaseSensitiveMatchingState(boolean state) {
+ this.caseSensitiveMatchingState = state;
+ }
+ /**
* Removes the mapping for the parsed concept type
* @param conceptType the concept type to remove the mapping
* @return the previously mapped dc:type value or <code>null</code> if
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java Fri Mar 16 12:26:27 2012
@@ -212,10 +212,12 @@ public class Suggestion implements Compa
* @return the best match or {@link Suggestion#getMatchedLabel()} if non is found
*/
public Text getBestLabel(String nameField, String language){
- Representation rep = getRepresentation();
+ Representation rep = getRepresentation();
+ //start with the matched label -> so if we do not find a better one
+ //we will use the matched!
+ Text label = this.label;
// 1. check if the returned Entity does has a label -> if not return null
// add labels (set only a single label. Use "en" if available!
- Text label = null;
Iterator<Text> labels = rep.getText(nameField);
boolean matchFound = false;
while (labels.hasNext() && !matchFound) {
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Mar 16 12:26:27 2012
@@ -56,6 +56,10 @@ org.apache.stanbol.enhancer.engines.keyw
org.apache.stanbol.enhancer.engines.keywordextraction.typeField.description=The field used to \
retrieve the types of matched Entities. Values of that field are expected to be URIs
+org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive.name=Case Sensitivity
+org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive.description=Allows to enable/disable \
+case sensitive matching
+
org.apache.stanbol.enhancer.engines.keywordextraction.redirectField.name=Redirect Field
org.apache.stanbol.enhancer.engines.keywordextraction.redirectField.description=Entities may \
define redirects to other Entities (e.g. "USA"(http://dbpedia.org/resource/USA) -> \
@@ -97,3 +101,17 @@ configuration (e.g. to 'en' in the case
org.apache.stanbol.enhancer.engines.keywordextraction.dereference.name=Dereference Entities
org.apache.stanbol.enhancer.engines.keywordextraction.dereference.description=If enabled additional \
data for suggested Entities are included.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings.name=Type Mappings
+org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings.description=This allows to add \
+additional entity-type > text-annotation-type mappings. Such mappings are used to determine the \
+'dc:type' value of the 'fise:TextAnnotation' created for extracted entities. Usage: \
+variant (a) '{uri}' short for {uri} > {uri} or (b) '{source1};{source2};..;{sourceN} > {target}'. \
+Note that a {source} may be only mapped to a single {target}. Multiple {source} types \
+can be mapped to the same {target}.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer.name=Keyword Tokenizer
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer.description=This allows \
+to use a special Tokenizer for matching keywords and alpha numeric IDs. Typical language \
+specific Tokenizers tend to split such IDs in several tokens and therefore might prevent \
+a correct matching.
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml Fri Mar 16 12:26:27 2012
@@ -45,7 +45,6 @@
<module>opennlp-ner</module>
<module>langid</module>
<module>topic</module>
- <module>metaxa</module>
<module>tika</module>
<module>geonames</module>
<module>entitytagging</module>
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java Fri Mar 16 12:26:27 2012
@@ -47,10 +47,12 @@ import java.util.Collections;
import java.util.Date;
import java.util.EnumMap;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.Set;
import java.util.TreeMap;
import javax.servlet.ServletContext;
@@ -80,6 +82,7 @@ import org.apache.clerezza.rdf.core.seri
import org.apache.clerezza.rdf.core.sparql.ParseException;
import org.apache.clerezza.rdf.ontologies.RDF;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.indexedgraph.IndexedMGraph;
import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
import org.apache.stanbol.enhancer.servicesapi.Blob;
@@ -232,7 +235,40 @@ public class ContentItemResource extends
public URI getMetadataHref() {
return metadataHref;
}
-
+ /**
+ * Checks if there are Occurrences
+ */
+ public boolean hasOccurrences(){
+ for(Map<String,EntityExtractionSummary> occ : extractionsByTypeMap.values()){
+ if(!occ.isEmpty()){
+ return true;
+ }
+ }
+ return false;
+ }
+ /**
+ * Used to print occurrences with other types than the natively supported
+ */
+ public Collection<UriRef> getOtherOccurrencyTypes(){
+ Set<UriRef> types = new HashSet<UriRef>(extractionsByTypeMap.keySet());
+ types.remove(DBPEDIA_PERSON);
+ types.remove(DBPEDIA_ORGANISATION);
+ types.remove(DBPEDIA_PLACE);
+ types.remove(SKOS_CONCEPT);
+ types.remove(null); //other
+ return types;
+ }
+ public String extractLabel(UriRef uri){
+ String fullUri = uri.getUnicodeString();
+ int index = Math.max(fullUri.lastIndexOf('#'),fullUri.lastIndexOf('/'));
+ index = Math.max(index, fullUri.lastIndexOf(':'));
+ //do not convert if the parsed uri does not contain a local name
+ if(index > 0 && index+1 < fullUri.length()){
+ return StringUtils.capitalize(fullUri.substring(index+1).replaceAll("[\\-_]", " "));
+ } else {
+ return uri.getUnicodeString();
+ }
+ }
public Collection<EntityExtractionSummary> getOccurrences(UriRef type){
Map<String,EntityExtractionSummary> typeMap = extractionsByTypeMap.get(type);
Collection<EntityExtractionSummary> typeOccurrences;
@@ -401,13 +437,17 @@ public class ContentItemResource extends
public String getThumbnailSrc() {
if (suggestions.isEmpty()) {
- return defaultThumbnails.get(type);
+ return getMissingThumbnailSrc();
}
return suggestions.get(0).getThumbnailSrc();
}
public String getMissingThumbnailSrc() {
- return defaultThumbnails.get(type);
+ String source = defaultThumbnails.get(type);
+ if(source == null){
+ source = defaultThumbnails.get(null);//default
+ }
+ return source;
}
public EntitySuggestion getBestGuess() {
@@ -512,11 +552,15 @@ public class ContentItemResource extends
return ((UriRef) object).getUnicodeString();
}
}
- return defaultThumbnails.get(type);
+ return getMissingThumbnailSrc();
}
public String getMissingThumbnailSrc() {
- return defaultThumbnails.get(type);
+ String source = defaultThumbnails.get(type);
+ if(source == null){
+ source = defaultThumbnails.get(null);
+ }
+ return source;
}
public String getSummary() {
Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl Fri Mar 16 12:26:27 2012
@@ -18,10 +18,10 @@
<#macro view>
<div class="entitylistings">
-<#if it.personOccurrences?size != 0 || it.organizationOccurrences?size != 0 || it.placeOccurrences?size != 0 || it.conceptOccurrences?size != 0 || it.otherOccurrences?size != 0>
+<#if it.hasOccurrences()>
<h3>Extracted entities</h3>
</#if>
-
+<#-- First print the predefined types -->
<div class="entitylisting">
<#if it.personOccurrences?size != 0>
<h3>People</h3>
@@ -50,6 +50,15 @@
</#if>
</div>
+<#-- add Occurrences with other types -->
+<#list it.otherOccurrencyTypes as type>
+ <div class="entitylisting">
+ <h3>${it.extractLabel(type)}</h3>
+ <@entities.listing entities=it.getOccurrences(type) />
+ </div>
+</#list>
+
+<#-- add Occurrences with no type -->
<div class="entitylisting">
<#if it.otherOccurrences?size != 0>
<h3>Others</h3>
Modified: incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java Fri Mar 16 12:26:27 2012
@@ -79,7 +79,47 @@ public enum NamespaceEnum {
/**
* The W3C Ontology for Media Resources http://www.w3.org/TR/mediaont-10/
*/
- media("http://www.w3.org/ns/ma-ont#")
+ media("http://www.w3.org/ns/ma-ont#"),
+ /*
+ * eHealth domain
+ */
+ /**
+ * DrugBank is a repository of almost 5000 FDA-approved small molecule and
+ * biotech drugs.
+ */
+ drugbank("http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugbank/"),
+ /**
+ * Dailymed is published by the National Library of Medicine,
+ * and provides high quality information about marketed drugs.
+ */
+ dailymed("http://www4.wiwiss.fu-berlin.de/dailymed/resource/dailymed/"),
+ /**
+ * SIDER contains information on marketed drugs and their adverse effects.
+ * The information is extracted from public documents and package inserts.
+ */
+ sider("http://www4.wiwiss.fu-berlin.de/sider/resource/sider/"),
+ /**
+ * The Linked Clinical Trials (LinkedCT) project aims at publishing the
+ * first open Semantic Web data source for clinical trials data.
+ */
+ linkedct("http://data.linkedct.org/resource/linkedct/"),
+ /**
+ * STITCH contains information on chemicals and proteins as well as their
+ * interactions and links.
+ */
+ stitch("http://www4.wiwiss.fu-berlin.de/stitch/resource/stitch/"),
+ /**
+ * Diseasome publishes a network of 4,300 disorders and disease genes linked
+ * by known disorder-gene associations for exploring all known phenotype and
+ * disease gene associations, indicating the common genetic origin of many
+ * diseases.
+ */
+ diseasome("http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseasome/"),
+ /**
+ * National Cancer Institute Thesaurus (http://www.mindswap.org/2003/CancerOntology/)
+ */
+ nci("http://www.mindswap.org/2003/nciOncology.owl#"),
+ tcm("http://purl.org/net/tcm/tcm.lifescience.ntu.edu.tw/")
;
/**
* The logger
Modified: incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java Fri Mar 16 12:26:27 2012
@@ -41,7 +41,22 @@ public final class QueryUtils {
* of STR no whitespace is assumed. Therefore spaces need to be replaced with '+' to search for tokens
* with the exact name. In all other cases the string need not to be converted.
*
- * Note also that text queries are converted to lower case
+ * <del>Note also that text queries are converted to lower case</del>
+ * Note: since 2012-03-14 only wildcard queries are converted to lower case.
+ * <p>
+ * <b>TODO:</b> Until Solr 3.6 is released and the implementation of
+ * <a href="https://issues.apache.org/jira/browse/SOLR-2438">SOLR-2438</a> is
+ * released this needs to still convert wildcard queries to lower case.<br>
+ * Because of that:<ul>
+ * <li> in case <code>escape=true</code>: non-wildcard queries should support
+ * case sensitive searches. If the searched Solr field uses a lowerCase
+ * filter then this will be done by Solr anyway, and if not then case
+ * sensitivity might be important!
+ * <li> for <code>escape=false</code> (wildcard searches) the values are
+ * still converted to lower case to stay compatible with previous versions.
+ * TODO: the caseSensitive parameter of TextConstraints should be used
+ * instead
+ * </ul>
*
* @param value
* the index value
@@ -62,13 +77,17 @@ public final class QueryUtils {
value = SolrUtil.escapeWildCardString(value);
}
if (IndexDataTypeEnum.TXT.getIndexType().equals(indexValue.getType())) {
- value = value.toLowerCase();
+ if(!escape){
+ value = value.toLowerCase();
+ } //rw: 20120314: respect case sensitivity for escaped (non wildcard)
Collection<String> tokens = new HashSet<String>(
Arrays.asList(value.split(" ")));
tokens.remove("");
queryConstraints = tokens.toArray(new String[tokens.size()]);
} else if (IndexDataTypeEnum.STR.getIndexType().equals(indexValue.getType())) {
- value = value.toLowerCase();
+ if(!escape){
+ value = value.toLowerCase();
+ } //rw: 20120314: respect case sensitivity for escaped (non wildcard)
queryConstraints = new String[] {value.replace(' ', '+')};
} else {
queryConstraints = new String[] {value};
Propchange: incubator/stanbol/branches/0.9.0-incubating/reasoners/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri Mar 16 12:26:27 2012
@@ -1,2 +1,3 @@
/incubator/stanbol/branches/jena-reasoners/reasoners:1156596-1163703
/incubator/stanbol/branches/lto-reasoners/reasoners:1180011-1205767
+/incubator/stanbol/trunk/reasoners:1301064-1301458