You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by fc...@apache.org on 2012/03/16 13:26:28 UTC

svn commit: r1301460 - in /incubator/stanbol/branches/0.9.0-incubating: ./ commons/opennlp/ commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ enhancer/engines/ enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer...

Author: fchrist
Date: Fri Mar 16 12:26:27 2012
New Revision: 1301460

URL: http://svn.apache.org/viewvc?rev=1301460&view=rev
Log:
Merging latest changes from trunk to the release branch

Added:
    incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/README.md
      - copied unchanged from r1301458, incubator/stanbol/trunk/commons/opennlp/README.md
    incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java
      - copied unchanged from r1301458, incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java
Modified:
    incubator/stanbol/branches/0.9.0-incubating/   (props changed)
    incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
    incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
    incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
    incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
    incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml
    incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
    incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl
    incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
    incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java
    incubator/stanbol/branches/0.9.0-incubating/reasoners/   (props changed)

Propchange: incubator/stanbol/branches/0.9.0-incubating/
------------------------------------------------------------------------------
    svn:mergeinfo = /incubator/stanbol/trunk:1301064-1301458

Modified: incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java Fri Mar 16 12:26:27 2012
@@ -52,6 +52,7 @@ public class TextAnalyzer {
     
     public static final class TextAnalyzerConfig {
         protected boolean forceSimpleTokenizer = false; //default to false
+        protected boolean forceKeywordTokenizer = false; //default to false
         protected boolean enablePosTagger = true;
         protected boolean enableChunker = true;
         protected boolean enableSentenceDetector = true;
@@ -67,6 +68,19 @@ public class TextAnalyzer {
     
         public final void forceSimpleTokenizer(boolean useSimpleTokenizer) {
             this.forceSimpleTokenizer = useSimpleTokenizer;
+            if(useSimpleTokenizer){ //simple and keyword tokenizer are mutually exclusive
+                this.forceKeywordTokenizer = false; //forcing the simple tokenizer resets the keyword tokenizer flag
+            }
+        }
+        public final boolean isKeywordTokenizerForced() { //getter for the forceKeywordTokenizer flag
+            return forceKeywordTokenizer;
+        }
+    
+        public final void forceKeywordTokenizer(boolean useKeywordTokenizer) {
+            this.forceKeywordTokenizer = useKeywordTokenizer;
+            if(useKeywordTokenizer){ //simple and keyword tokenizer are mutually exclusive
+                this.forceSimpleTokenizer = false; //forcing the keyword tokenizer resets the simple tokenizer flag
+            }
         }
     
         public final boolean isPosTaggerEnable() {
@@ -237,6 +251,8 @@ public class TextAnalyzer {
         if(tokenizer == null){
             if(config.forceSimpleTokenizer){
                 tokenizer = SimpleTokenizer.INSTANCE;
+            } else if(config.forceKeywordTokenizer){
+                tokenizer = KeywordTokenizer.INSTANCE;
             } else {
                 tokenizer = openNLP.getTokenizer(language);
                 if(tokenizer == null){

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Fri Mar 16 12:26:27 2012
@@ -19,6 +19,8 @@ package org.apache.stanbol.enhancer.engi
 import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
 
 import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -77,6 +79,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
 import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
 import org.apache.stanbol.entityhub.servicesapi.model.Reference;
 import org.apache.stanbol.entityhub.servicesapi.model.Text;
 import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
@@ -98,6 +101,7 @@ import org.slf4j.LoggerFactory;
     @Property(name=EnhancementEngine.PROPERTY_NAME),
     @Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
     @Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD),
+    @Property(name=KeywordLinkingEngine.CASE_SENSITIVE,boolValue=EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
     @Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD),
     @Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD),
     @Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
@@ -113,10 +117,12 @@ import org.slf4j.LoggerFactory;
         },value="IGNORE"),
     @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
         intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
+    @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
     @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
         intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
     @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
     @Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""),
+    @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000),
     @Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES,
         boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE),
     @Property(name=Constants.SERVICE_RANKING,intValue=0)
@@ -147,6 +153,7 @@ public class KeywordLinkingEngine 
     public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
     public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
     public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
+    public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive";
     public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
     public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
     public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
@@ -155,7 +162,8 @@ public class KeywordLinkingEngine 
     public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
     public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
     public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
-//  public static final String SIMPLE_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
+    public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
+    public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
 //  public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
     /**
      * Adds the dereference feature (STANBOL-333) also to this engine.
@@ -590,6 +598,13 @@ public class KeywordLinkingEngine 
                 "The configured min POS tag probability MUST BE in the range [0..1] " +
                 "or < 0 to deactivate this feature (parsed value "+value+")!");
         }
+        value = configuration.get(KEYWORD_TOKENIZER);
+        //the keyword tokenizer config
+        if(value instanceof Boolean){
+            nlpConfig.forceKeywordTokenizer((Boolean)value);
+        } else if(value != null && !value.toString().isEmpty()){
+            nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
+        }
         nlpConfig.setMinPosTagProbability(minPosTagProb);
         analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
     }
@@ -626,6 +641,13 @@ public class KeywordLinkingEngine 
             }
             linkerConfig.setNameField(value.toString());
         }
+        //init case sensitivity
+        value = configuration.get(CASE_SENSITIVE);
+        if(value instanceof Boolean){
+            linkerConfig.setCaseSensitiveMatchingState((Boolean)value);
+        } else if(value != null && !value.toString().isEmpty()){
+            linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
+        } //if NULL or empty use default
         //init TYPE_FIELD
         value = configuration.get(TYPE_FIELD);
         if(value != null){
@@ -725,6 +747,66 @@ public class KeywordLinkingEngine 
                 linkerConfig.setDefaultLanguage(defaultLang);
             }
         }
+        //init type mappings; each entry is either '{uri}' or '{source1};..;{sourceN} > {target}'
+        value = configuration.get(TYPE_MAPPINGS);
+        if(value instanceof String[]){ //support array
+            value = Arrays.asList((String[])value);
+        } else if(value instanceof String) { //single value
+            value = Collections.singleton(value);
+        }
+        if(value instanceof Collection<?>){ //and collection
+            log.info("Init Type Mappings");
+            configs :
+            for(Object o : (Iterable<?>)value){
+                if(o != null){ //null entries are silently skipped
+                    StringBuilder usage = new StringBuilder("useages: ");
+                    usage.append("a: '{uri}' short for {uri} > {uri} | ");
+                    usage.append("b: '{source1};{source2};..;{sourceN} > {target}'");
+                    String[] config = o.toString().split(">"); //NOTE: split drops trailing empty strings, so ">" results in an empty array
+                    if(config.length == 0 || config[0].isEmpty()){ //length check avoids an ArrayIndexOutOfBoundsException for ">"
+                        log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config",
+                            o,usage);
+                        continue configs;
+                    }
+                    String[] sourceTypes = config[0].split(";");
+                    if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){ //multiple sources require an explicit target
+                        log.warn("Invalid Type Mapping Config '{}': Missing Target Type ({}) -> ignore this config",
+                            o,usage);
+                        continue configs;
+                    }
+                    String targetType = config.length < 2 ? sourceTypes[0] : config[1]; //variant (a): the single source is also the target
+                    targetType = getFullName(targetType.trim()); //support for ns:localName
+                    try { //validate
+                        new URI(targetType);
+                    } catch (URISyntaxException e) {
+                        log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config",
+                            targetType,o); //log the target that failed validation (not the first source)
+                        continue configs;
+                    }
+                    UriRef targetUri = new UriRef(targetType);
+                    for(String sourceType : sourceTypes){
+                        if(!sourceType.isEmpty()){
+                            sourceType = getFullName(sourceType.trim()); //support for ns:localName
+                            try { //validate
+                                new URI(sourceType);
+                                UriRef old = linkerConfig.setTypeMapping(sourceType, targetUri);
+                                if(old == null){
+                                    log.info(" > add type mapping {} > {}", sourceType,targetType);
+                                } else {
+                                    log.info(" > set type mapping {} > {} (old: {})", 
+                                        new Object[]{sourceType,targetType,old.getUnicodeString()});
+                                }
+                            } catch (URISyntaxException e) { //only this source is skipped, others of the same config are still processed
+                                log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type",
+                                    sourceType,o); //log the source that failed validation (not always the first one)
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            log.debug("No Type mappings configured");
+        }
     }
 
     /**

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Fri Mar 16 12:26:27 2012
@@ -353,7 +353,10 @@ public class EntityLinker {
      * @param label
      */
     private void matchLabel(Suggestion match, Text label) {
-        String text = label.getText().toLowerCase();
+        String text = label.getText();
+        if(!config.isCaseSensitiveMatching()){
+            text = text.toLowerCase(); //TODO use language of label for Locale
+        }
         //Tokenize the label and remove remove tokens without alpha numerical chars
         String[] unprocessedLabelTokens = content.tokenize(text);
         int offset = 0;
@@ -396,7 +399,10 @@ public class EntityLinker {
                 && search ;currentIndex++){
             currentToken = state.getSentence().getTokens().get(currentIndex);
             if(currentToken.hasAplhaNumericChar()){
-                currentTokenText = currentToken.getText().toLowerCase();
+                currentTokenText = currentToken.getText();
+                if(!config.isCaseSensitiveMatching()){
+                    currentTokenText = currentTokenText.toLowerCase();
+                }
                 currentTokenLength = currentTokenText.length();
                 boolean isProcessable = isProcessableToken(currentToken);
                 boolean found = false;
@@ -460,7 +466,10 @@ public class EntityLinker {
             String labelTokenText = labelTokens[labelIndex];
             if(labelTokenSet.remove(labelTokenText)){ //still not matched
                 currentToken = state.getSentence().getTokens().get(currentIndex);
-                currentTokenText = currentToken.getText().toLowerCase();
+                currentTokenText = currentToken.getText();
+                if(!config.isCaseSensitiveMatching()){
+                    currentTokenText = currentTokenText.toLowerCase();
+                }
                 currentTokenLength = currentTokenText.length();
                 boolean found = false;
                 float matchFactor = 0f;
@@ -503,7 +512,7 @@ public class EntityLinker {
         //   match (this will be very rare
         if(foundProcessableTokens > 0 && match.getMatchCount() <= foundProcessableTokens) {
             String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
-            if(currentText.equalsIgnoreCase(text)){ 
+            if(config.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){ 
                 labelMatch = MATCH.EXACT;
                 //set found to covered: May be lower because only
                 //processable tokens are counted, but Exact also checks

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java Fri Mar 16 12:26:27 2012
@@ -88,6 +88,10 @@ public class EntityLinkerConfig {
      */
     public static final String DEFAULT_LANGUAGE = null;
     /**
+     * The default for case sensitive matching is set to <code>false</code>
+     */
+    public static final boolean DEFAULT_CASE_SENSITIVE_MATCHING_STATE = false;
+    /**
      * Default mapping for Concept types to dc:type values added for
      * TextAnnotations.
      */
@@ -98,6 +102,7 @@ public class EntityLinkerConfig {
         mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(), OntologicalClasses.DBPEDIA_ORGANISATION);
         mappings.put(NamespaceEnum.dbpediaOnt+"Newspaper", OntologicalClasses.DBPEDIA_ORGANISATION);
         mappings.put(NamespaceEnum.schema+"Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+//        mappings.put(NamespaceEnum.dailymed+"organization",OntologicalClasses.DBPEDIA_ORGANISATION);
         
         mappings.put(OntologicalClasses.DBPEDIA_PERSON.getUnicodeString(), OntologicalClasses.DBPEDIA_PERSON);
         mappings.put(NamespaceEnum.foaf+"Person", OntologicalClasses.DBPEDIA_PERSON);
@@ -108,6 +113,25 @@ public class EntityLinkerConfig {
         mappings.put(NamespaceEnum.gml+"_Feature", OntologicalClasses.DBPEDIA_PLACE);
 
         mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(), OntologicalClasses.SKOS_CONCEPT);
+
+//        UriRef DRUG = new UriRef(NamespaceEnum.drugbank+"drugs");
+//        mappings.put(DRUG.getUnicodeString(), DRUG);
+//        mappings.put(NamespaceEnum.dbpediaOnt+"Drug", DRUG);
+//        mappings.put(NamespaceEnum.dailymed+"drugs", DRUG);
+//        mappings.put(NamespaceEnum.sider+"drugs", DRUG);
+//        mappings.put(NamespaceEnum.tcm+"Medicine", DRUG);
+//        
+//        UriRef DISEASE = new UriRef(NamespaceEnum.diseasome+"diseases");
+//        mappings.put(DISEASE.getUnicodeString(), DISEASE);
+//        mappings.put(NamespaceEnum.linkedct+"condition", DISEASE);
+//        mappings.put(NamespaceEnum.tcm+"Disease", DISEASE);
+//
+//        UriRef SIDE_EFFECT = new UriRef(NamespaceEnum.sider+"side_effects");
+//        mappings.put(SIDE_EFFECT.getUnicodeString(), SIDE_EFFECT);
+//        
+//        UriRef INGREDIENT = new UriRef(NamespaceEnum.dailymed+"ingredients");
+//        mappings.put(INGREDIENT.getUnicodeString(), INGREDIENT);
+                
         DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
     }
     /**
@@ -162,6 +186,8 @@ public class EntityLinkerConfig {
      * more mapped to the actual label of an result.
      */
     private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS;
+    
+    private boolean caseSensitiveMatchingState = DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
     /**
      * Holds the mappings of rdf:type used by concepts to dc:type values used
      * by TextAnnotations. 
@@ -356,6 +382,20 @@ public class EntityLinkerConfig {
         this.maxSearchTokens = maxSearchTokens;
     }
     /**
+     * Getter for the case sensitive matching state
+     * @return <code>true</code> if case sensitive matching is enabled (default: <code>false</code>)
+     */
+    public boolean isCaseSensitiveMatching() {
+        return caseSensitiveMatchingState;
+    }
+    /**
+     * Setter for the case sensitive matching state
+     * @param state <code>true</code> to enable case sensitive matching, <code>false</code> to disable it
+     */
+    public void setCaseSensitiveMatchingState(boolean state) {
+        this.caseSensitiveMatchingState = state;
+    }
+    /**
      * Removes the mapping for the parsed concept type
      * @param conceptType the concept type to remove the mapping
      * @return the previously mapped dc:type value or <code>null</code> if

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java Fri Mar 16 12:26:27 2012
@@ -212,10 +212,12 @@ public class Suggestion implements Compa
      * @return the best match or {@link Suggestion#getMatchedLabel()} if non is found
      */
     public Text getBestLabel(String nameField, String language){
-        Representation rep = getRepresentation(); 
+        Representation rep = getRepresentation();
+        //start with the matched label -> so if we do not find a better one
+        //we will use the matched!
+        Text label = this.label;
         // 1. check if the returned Entity does has a label -> if not return null
         // add labels (set only a single label. Use "en" if available!
-        Text label = null;
         Iterator<Text> labels = rep.getText(nameField);
         boolean matchFound = false;
         while (labels.hasNext() && !matchFound) {

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Mar 16 12:26:27 2012
@@ -56,6 +56,10 @@ org.apache.stanbol.enhancer.engines.keyw
 org.apache.stanbol.enhancer.engines.keywordextraction.typeField.description=The field used to \
 retrieve the types of matched Entities. Values of that field are expected to be URIs
 
+org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive.name=Case Sensitivity
+org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive.description=Allows to enable/disable \
+case sensitive matching
+
 org.apache.stanbol.enhancer.engines.keywordextraction.redirectField.name=Redirect Field
 org.apache.stanbol.enhancer.engines.keywordextraction.redirectField.description=Entities may \
 define redirects to other Entities (e.g. "USA"(http://dbpedia.org/resource/USA) -> \
@@ -97,3 +101,17 @@ configuration (e.g. to 'en' in the case 
 org.apache.stanbol.enhancer.engines.keywordextraction.dereference.name=Dereference Entities
 org.apache.stanbol.enhancer.engines.keywordextraction.dereference.description=If enabled additional \
 data for suggested Entities are included.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings.name=Type Mappings
+org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings.description=This allows to add \
+additional entity-type > text-annotation-type mappings. Such mappings are used to determine the \
+'dc:type' value of the 'fise:TextAnnotation' created for extracted entities. Usage: \
+variant (a) '{uri}' short for {uri} > {uri} or (b) '{source1};{source2};..;{sourceN} > {target}'. \
+Note that a {source} may be only mapped to a single {target}. Multiple {source} types \
+can be mapped to the same {target}.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer.name=Keyword Tokenizer
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer.description=This allows \
+to use a special Tokenizer for matching keywords and alpha numeric IDs. Typical language \
+specific Tokenizers tend to split such IDs in several tokens and therefore might prevent \
+a correct matching.

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/engines/pom.xml Fri Mar 16 12:26:27 2012
@@ -45,7 +45,6 @@
     <module>opennlp-ner</module>
     <module>langid</module>
     <module>topic</module>
-    <module>metaxa</module>
     <module>tika</module>
     <module>geonames</module>
     <module>entitytagging</module>

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java Fri Mar 16 12:26:27 2012
@@ -47,10 +47,12 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.EnumMap;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Set;
 import java.util.TreeMap;
 
 import javax.servlet.ServletContext;
@@ -80,6 +82,7 @@ import org.apache.clerezza.rdf.core.seri
 import org.apache.clerezza.rdf.core.sparql.ParseException;
 import org.apache.clerezza.rdf.ontologies.RDF;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.commons.indexedgraph.IndexedMGraph;
 import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
@@ -232,7 +235,40 @@ public class ContentItemResource extends
     public URI getMetadataHref() {
         return metadataHref;
     }
-
+    /**
+     * Checks if there are Occurrences, i.e. if the extraction map of at least one type is not empty
+     */
+    public boolean hasOccurrences(){
+        for(Map<String,EntityExtractionSummary> occ : extractionsByTypeMap.values()){
+            if(!occ.isEmpty()){
+                return true;
+            }
+        }
+        return false;
+    }
+    /**
+     * Getter for all occurrence types other than Person, Organisation, Place, Concept and the <code>null</code> key
+     */
+    public Collection<UriRef> getOtherOccurrencyTypes(){
+        Set<UriRef>  types = new HashSet<UriRef>(extractionsByTypeMap.keySet()); //copy, so the map itself is not modified
+        types.remove(DBPEDIA_PERSON);
+        types.remove(DBPEDIA_ORGANISATION);
+        types.remove(DBPEDIA_PLACE);
+        types.remove(SKOS_CONCEPT);
+        types.remove(null); //other
+        return types;
+    }
+    public String extractLabel(UriRef uri){ //builds a display label from the local name of the parsed URI
+        String fullUri = uri.getUnicodeString();
+        int index = Math.max(fullUri.lastIndexOf('#'),fullUri.lastIndexOf('/')); //last '#' or '/' ...
+        index = Math.max(index, fullUri.lastIndexOf(':')); //... or ':', whichever comes last
+        //do not convert if the parsed uri does not contain a local name
+        if(index > 0 && index+1 < fullUri.length()){
+            return StringUtils.capitalize(fullUri.substring(index+1).replaceAll("[\\-_]", " ")); //'-' and '_' are replaced by spaces
+        } else {
+            return uri.getUnicodeString();
+        }
+    }
     public Collection<EntityExtractionSummary> getOccurrences(UriRef type){
         Map<String,EntityExtractionSummary> typeMap = extractionsByTypeMap.get(type);
         Collection<EntityExtractionSummary> typeOccurrences;
@@ -401,13 +437,17 @@ public class ContentItemResource extends
 
         public String getThumbnailSrc() {
             if (suggestions.isEmpty()) {
-                return defaultThumbnails.get(type);
+                return getMissingThumbnailSrc();
             }
             return suggestions.get(0).getThumbnailSrc();
         }
 
         public String getMissingThumbnailSrc() {
-            return defaultThumbnails.get(type);
+            String source = defaultThumbnails.get(type);
+            if(source == null){
+                source = defaultThumbnails.get(null);//default
+            }
+            return source;
         }
 
         public EntitySuggestion getBestGuess() {
@@ -512,11 +552,15 @@ public class ContentItemResource extends
                     return ((UriRef) object).getUnicodeString();
                 }
             }
-            return defaultThumbnails.get(type);
+            return getMissingThumbnailSrc();
         }
 
         public String getMissingThumbnailSrc() {
-            return defaultThumbnails.get(type);
+            String source = defaultThumbnails.get(type);
+            if(source == null){
+                source = defaultThumbnails.get(null);
+            }
+            return source;
         }
 
         public String getSummary() {

Modified: incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl (original)
+++ incubator/stanbol/branches/0.9.0-incubating/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl Fri Mar 16 12:26:27 2012
@@ -18,10 +18,10 @@
 <#macro view>
 
 <div class="entitylistings">
-<#if it.personOccurrences?size != 0 || it.organizationOccurrences?size != 0 ||  it.placeOccurrences?size != 0 || it.conceptOccurrences?size != 0 || it.otherOccurrences?size != 0>
+<#if it.hasOccurrences()>
 <h3>Extracted entities</h3>
 </#if>
-
+<#-- First print the predefined types -->
 <div class="entitylisting">
 <#if it.personOccurrences?size != 0>
 <h3>People</h3>
@@ -50,6 +50,15 @@
 </#if>
 </div>
 
+<#-- add Occurrences with other types -->
+<#list it.otherOccurrencyTypes as type>
+  <div class="entitylisting">
+  <h3>${it.extractLabel(type)}</h3>
+  <@entities.listing entities=it.getOccurrences(type) />
+  </div>
+</#list>
+
+<#-- add Occurrences with no type -->
 <div class="entitylisting">
 <#if it.otherOccurrences?size != 0>
 <h3>Others</h3>

Modified: incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java Fri Mar 16 12:26:27 2012
@@ -79,7 +79,47 @@ public enum NamespaceEnum {
     /**
      * The W3C Ontology for Media Resources http://www.w3.org/TR/mediaont-10/
      */
-    media("http://www.w3.org/ns/ma-ont#")
+    media("http://www.w3.org/ns/ma-ont#"),
+    /*
+     * eHealth domain 
+     */
+    /**
+     * DrugBank is a repository of almost 5000 FDA-approved small molecule and 
+     * biotech drugs. 
+     */
+    drugbank("http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugbank/"),
+    /**
+     * Dailymed is published by the National Library of Medicine, 
+     * and provides high quality information about marketed drugs.
+     */
+    dailymed("http://www4.wiwiss.fu-berlin.de/dailymed/resource/dailymed/"),
+    /**
+     * SIDER contains information on marketed drugs and their adverse effects. 
+     * The information is extracted from public documents and package inserts.
+     */
+    sider("http://www4.wiwiss.fu-berlin.de/sider/resource/sider/"),
+    /**
+     * The Linked Clinical Trials (LinkedCT) project aims at publishing the 
+     * first open Semantic Web data source for clinical trials data.
+     */
+    linkedct("http://data.linkedct.org/resource/linkedct/"),
+    /**
+     * STITCH contains information on chemicals and proteins as well as their 
+     * interactions and links.
+     */
+    stitch("http://www4.wiwiss.fu-berlin.de/stitch/resource/stitch/"),
+    /**
+     * Diseasome publishes a network of 4,300 disorders and disease genes linked 
+     * by known disorder-gene associations for exploring all known phenotype and 
+     * disease gene associations, indicating the common genetic origin of many 
+     * diseases.
+     */
+    diseasome("http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseasome/"),
+    /**
+     * National Cancer Institute Thesaurus (http://www.mindswap.org/2003/CancerOntology/)
+     */
+    nci("http://www.mindswap.org/2003/nciOncology.owl#"),
+    tcm("http://purl.org/net/tcm/tcm.lifescience.ntu.edu.tw/")
     ;
     /**
      * The logger

Modified: incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java?rev=1301460&r1=1301459&r2=1301460&view=diff
==============================================================================
--- incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java (original)
+++ incubator/stanbol/branches/0.9.0-incubating/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java Fri Mar 16 12:26:27 2012
@@ -41,7 +41,22 @@ public final class QueryUtils {
      * of STR no whitespace is assumed. Therefore spaces need to be replaced with '+' to search for tokens
      * with the exact name. In all other cases the string need not to be converted.
      * 
-     * Note also that text queries are converted to lower case
+     * <del>Note also that text queries are converted to lower case</del>
+     * Note: since 2012-03-14 only wildcard queries are converted to lower case.
+     * <p>
+     * <b>TODO:</b> Until Solr 3.6 is released and the implementation of
+     * <a href="https://issues.apache.org/jira/browse/SOLR-2438">SOLR-2438</a> is
+     * released this needs to still convert wildcard queries to lower case.<br>
+     * Because of that:<ul>
+     * <li> in case <code>escape=true</code>. Non-wildcard queries should support
+     * case sensitive searches. If the searched solr field uses a lowerCase
+     * filter then this will be done by Solr anyway, and if not then case
+     * sensitivity might be important!
+     * <li> for <code>escape=false</code> (wildcard searches) the values are
+     * still converted to lower case to keep compatible with previous versions.
+     * TODO: the caseSensitive parameter of TextConstraints should be used
+     * instead
+     * </ul>
      * 
      * @param value
      *            the index value
@@ -62,13 +77,17 @@ public final class QueryUtils {
             value = SolrUtil.escapeWildCardString(value);
         }
         if (IndexDataTypeEnum.TXT.getIndexType().equals(indexValue.getType())) {
-        	value = value.toLowerCase();
+            if(!escape){ 
+                value = value.toLowerCase();
+            } //rw: 20120314: respect case sensitivity for escaped (non wildcard)
             Collection<String> tokens = new HashSet<String>(
                     Arrays.asList(value.split(" ")));
             tokens.remove("");
             queryConstraints = tokens.toArray(new String[tokens.size()]);
         } else if (IndexDataTypeEnum.STR.getIndexType().equals(indexValue.getType())) {
-            value = value.toLowerCase();
+            if(!escape){ 
+                value = value.toLowerCase();
+            } //rw: 20120314: respect case sensitivity for escaped (non wildcard)
             queryConstraints = new String[] {value.replace(' ', '+')};
         } else {
             queryConstraints = new String[] {value};

Propchange: incubator/stanbol/branches/0.9.0-incubating/reasoners/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri Mar 16 12:26:27 2012
@@ -1,2 +1,3 @@
 /incubator/stanbol/branches/jena-reasoners/reasoners:1156596-1163703
 /incubator/stanbol/branches/lto-reasoners/reasoners:1180011-1205767
+/incubator/stanbol/trunk/reasoners:1301064-1301458