You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/03/16 06:21:23 UTC
svn commit: r1301339 - in /incubator/stanbol/trunk: commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/ enhancer/engines/keywor...

Author: rwesten
Date: Fri Mar 16 05:21:22 2012
New Revision: 1301339

URL: http://svn.apache.org/viewvc?rev=1301339&view=rev
Log:
Several Improvements to the KeywordLinkingEngine

* Type Mappings are now configurable (implementation of STANBOL-536)
* Added an OpenNLP Tokenizer implementation that is optimized for matching Keywords. The use of this Tokenizer can be forced by an option of the KeywordLinkingEngine (implementation of STANBOL-538)
* The KeywordLinkingEngine now supports case sensitive matching (implementation of STANBOL-535). This also required changes to the SolrYard, as up to now parsed queries were converted to lower case because wildcard queries are not processed by the query analyzer. Now the SolrYard only converts the query string of wildcard queries to lower case. This special treatment can be removed with the release of Solr 3.6, because then wildcard queries will also be converted to lower case if the schema.xml is configured accordingly


other changes:

* The Stanbol Enhancer Web UI uses now separate categories for any dc:type value found for TextAnnotations.
* Added the following Namespaces to the NamespaceEnum of the Entityhub

    drugbank "http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugbank/"
    dailymed "http://www4.wiwiss.fu-berlin.de/dailymed/resource/dailymed/"
    sider "http://www4.wiwiss.fu-berlin.de/sider/resource/sider/"
    linkedct "http://data.linkedct.org/resource/linkedct/"
    stitch "http://www4.wiwiss.fu-berlin.de/stitch/resource/stitch/"
    diseasome "http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseasome/"
    nci "http://www.mindswap.org/2003/nciOncology.owl#"
    tcm "http://purl.org/net/tcm/tcm.lifescience.ntu.edu.tw/"
    

Added:
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java   (with props)
Modified:
    incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
    incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
    incubator/stanbol/trunk/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl
    incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
    incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java

Added: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java?rev=1301339&view=auto
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java (added)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java Fri Mar 16 05:21:22 2012
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.commons.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Performs tokenization using the character class whitespace. Will create
+ * separate tokens for punctuation at the end of words. 
+ * Intended to be used to extract alphanumeric
+ * keywords from texts
+ * 
+ * @author Rupert Westenthaler
+ */
+public class KeywordTokenizer implements Tokenizer {
+
+    /** Shared instance; this class declares no mutable state. */
+    public static final KeywordTokenizer INSTANCE = new KeywordTokenizer();
+
+    private KeywordTokenizer() {}
+
+    /** Returns the text covered by each span of {@link #tokenizePos(String)}. */
+    public String[] tokenize(String s) {
+        return Span.spansToStrings(tokenizePos(s), s);
+    }
+
+    /**
+     * Splits the text at whitespace. One of <code>. , ! ? ; :</code> ending a
+     * token of at least two chars is returned as a token of its own.
+     * @param s the text to tokenize (must not be <code>null</code>)
+     * @return the spans of the tokens in the order they appear in the text
+     */
+    public Span[] tokenizePos(String s) {
+        List<Span> tokens = new ArrayList<Span>();
+        int sl = s.length();
+        int start = -1;
+        for (int ci = 0; ci <= sl; ci++) {
+            char c = ci < sl ? s.charAt(ci) : ' '; // virtual whitespace closes a token at the end
+            boolean isWhitespace = StringUtil.isWhitespace(c);
+            if (!isWhitespace && start < 0) { // new token starts
+                start = ci;
+            } else if (isWhitespace && start >= 0) { // end of token
+                char pc = s.charAt(ci - 1); // last char of the token (start < ci holds here)
+                if (start < ci - 1 && ".,!?;:".indexOf(pc) >= 0) { // trailing punctuation
+                    tokens.add(new Span(start, ci - 1));
+                    tokens.add(new Span(ci - 1, ci));
+                } else {
+                    tokens.add(new Span(start, ci));
+                }
+                start = -1;
+            }
+        }
+        return tokens.toArray(new Span[tokens.size()]);
+    }
+}
\ No newline at end of file

Propchange: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/KeywordTokenizer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java (original)
+++ incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java Fri Mar 16 05:21:22 2012
@@ -52,6 +52,7 @@ public class TextAnalyzer {
     
     public static final class TextAnalyzerConfig {
         protected boolean forceSimpleTokenizer = false; //default to false
+        protected boolean forceKeywordTokenizer = false; //default to false
         protected boolean enablePosTagger = true;
         protected boolean enableChunker = true;
         protected boolean enableSentenceDetector = true;
@@ -67,6 +68,19 @@ public class TextAnalyzer {
     
         public final void forceSimpleTokenizer(boolean useSimpleTokenizer) {
             this.forceSimpleTokenizer = useSimpleTokenizer;
+            if(useSimpleTokenizer){ //forcing this tokenizer revokes a forced keyword tokenizer
+                this.forceKeywordTokenizer = false;
+            }
+        }
+        public final boolean isKeywordTokenizerForced() {
+            return forceKeywordTokenizer; //set by forceKeywordTokenizer(..), reset by forceSimpleTokenizer(true)
+        }
+    
+        public final void forceKeywordTokenizer(boolean useKeywordTokenizer) {
+            this.forceKeywordTokenizer = useKeywordTokenizer;
+            if(useKeywordTokenizer){ //forcing this tokenizer revokes a forced simple tokenizer
+                this.forceSimpleTokenizer = false;
+            }
         }
     
         public final boolean isPosTaggerEnable() {
@@ -237,6 +251,8 @@ public class TextAnalyzer {
         if(tokenizer == null){
             if(config.forceSimpleTokenizer){
                 tokenizer = SimpleTokenizer.INSTANCE;
+            } else if(config.forceKeywordTokenizer){
+                tokenizer = KeywordTokenizer.INSTANCE;
             } else {
                 tokenizer = openNLP.getTokenizer(language);
                 if(tokenizer == null){

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Fri Mar 16 05:21:22 2012
@@ -19,6 +19,8 @@ package org.apache.stanbol.enhancer.engi
 import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
 
 import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -77,6 +79,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
 import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
 import org.apache.stanbol.entityhub.servicesapi.model.Reference;
 import org.apache.stanbol.entityhub.servicesapi.model.Text;
 import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
@@ -98,6 +101,7 @@ import org.slf4j.LoggerFactory;
     @Property(name=EnhancementEngine.PROPERTY_NAME),
     @Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
     @Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD),
+    @Property(name=KeywordLinkingEngine.CASE_SENSITIVE,boolValue=EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
     @Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD),
     @Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD),
     @Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
@@ -113,10 +117,12 @@ import org.slf4j.LoggerFactory;
         },value="IGNORE"),
     @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
         intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
+    @Property(name=KeywordLinkingEngine.KEYWORD_TOKENIZER,boolValue=false),
     @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
         intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
     @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
     @Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value=""),
+    @Property(name=KeywordLinkingEngine.TYPE_MAPPINGS,cardinality=1000),
     @Property(name=KeywordLinkingEngine.DEREFERENCE_ENTITIES,
         boolValue=KeywordLinkingEngine.DEFAULT_DEREFERENCE_ENTITIES_STATE),
     @Property(name=Constants.SERVICE_RANKING,intValue=0)
@@ -147,6 +153,7 @@ public class KeywordLinkingEngine 
     public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
     public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
     public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
+    public static final String CASE_SENSITIVE = "org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive";
     public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
     public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
     public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
@@ -155,7 +162,8 @@ public class KeywordLinkingEngine 
     public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
     public static final String DEFAULT_MATCHING_LANGUAGE = "org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
     public static final String MIN_POS_TAG_PROBABILITY = "org.apache.stanbol.enhancer.engines.keywordextraction.minPosTagProbability";
-//  public static final String SIMPLE_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
+    public static final String TYPE_MAPPINGS = "org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings";
+    public static final String KEYWORD_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer";
 //  public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
     /**
      * Adds the dereference feature (STANBOL-333) also to this engine.
@@ -590,6 +598,13 @@ public class KeywordLinkingEngine 
                 "The configured min POS tag probability MUST BE in the range [0..1] " +
                 "or < 0 to deactivate this feature (parsed value "+value+")!");
         }
+        value = configuration.get(KEYWORD_TOKENIZER);
+        //the keyword tokenizer config
+        if(value instanceof Boolean){
+            nlpConfig.forceKeywordTokenizer((Boolean)value);
+        } else if(value != null && !value.toString().isEmpty()){
+            nlpConfig.forceKeywordTokenizer(Boolean.valueOf(value.toString()));
+        }
         nlpConfig.setMinPosTagProbability(minPosTagProb);
         analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(openNLP,nlpConfig);
     }
@@ -626,6 +641,13 @@ public class KeywordLinkingEngine 
             }
             linkerConfig.setNameField(value.toString());
         }
+        //init case sensitivity
+        value = configuration.get(CASE_SENSITIVE);
+        if(value instanceof Boolean){
+            linkerConfig.setCaseSensitiveMatchingState((Boolean)value);
+        } else if(value != null && !value.toString().isEmpty()){
+            linkerConfig.setCaseSensitiveMatchingState(Boolean.valueOf(value.toString()));
+        } //if NULL or empty use default
         //init TYPE_FIELD
         value = configuration.get(TYPE_FIELD);
         if(value != null){
@@ -725,6 +747,66 @@ public class KeywordLinkingEngine 
                 linkerConfig.setDefaultLanguage(defaultLang);
             }
         }
+        //init type mappings: each entry is '{uri}' or '{source1};..;{sourceN} > {target}'
+        value = configuration.get(TYPE_MAPPINGS);
+        if(value instanceof String[]){ //support array
+            value = Arrays.asList((String[])value);
+        } else if(value instanceof String) { //single value
+            value = Collections.singleton(value);
+        }
+        if(value instanceof Collection<?>){ //and collection
+            log.info("Init Type Mappings");
+            configs :
+            for(Object o : (Iterable<?>)value){
+                if(o != null){
+                    StringBuilder usage = new StringBuilder("useages: ");
+                    usage.append("a: '{uri}' short for {uri} > {uri} | ");
+                    usage.append("b: '{source1};{source2};..;{sourceN} > {target}'");
+                    String[] config = o.toString().split(">");
+                    if(config.length < 1 || config[0].isEmpty()){ //split(..) of ">" is an empty array
+                        log.warn("Invalid Type Mapping Config '{}': Missing Source Type ({}) -> ignore this config",
+                            o,usage);
+                        continue configs;
+                    }
+                    String[] sourceTypes = config[0].split(";");
+                    if(sourceTypes.length > 1 && (config.length < 2 || config[1].isEmpty())){
+                        log.warn("Invalid Type Mapping Config '{}': Missing Target Type ({}) -> ignore this config",
+                            o,usage);
+                        continue configs;
+                    }
+                    String targetType = config.length < 2 ? sourceTypes[0] : config[1]; //variant a: source is target
+                    targetType = getFullName(targetType.trim()); //support for ns:localName
+                    try { //validate
+                        new URI(targetType);
+                    } catch (URISyntaxException e) {
+                        log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this config",
+                            targetType,o); //report the URI that failed the validation
+                        continue configs;
+                    }
+                    UriRef targetUri = new UriRef(targetType);
+                    for(String sourceType : sourceTypes){
+                        if(!sourceType.isEmpty()){
+                            sourceType = getFullName(sourceType.trim()); //support for ns:localName
+                            try { //validate
+                                new URI(sourceType);
+                                UriRef old = linkerConfig.setTypeMapping(sourceType, targetUri);
+                                if(old == null){
+                                    log.info(" > add type mapping {} > {}", sourceType,targetType);
+                                } else {
+                                    log.info(" > set type mapping {} > {} (old: {})", 
+                                        new Object[]{sourceType,targetType,old.getUnicodeString()});
+                                }
+                            } catch (URISyntaxException e) {
+                                log.warn("Invalid URI '{}' in Type Mapping Config '{}' -> ignore this source type",
+                                    sourceType,o); //report the URI that failed the validation
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            log.debug("No Type mappings configured");
+        }
     }
 
     /**

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Fri Mar 16 05:21:22 2012
@@ -353,7 +353,10 @@ public class EntityLinker {
      * @param label
      */
     private void matchLabel(Suggestion match, Text label) {
-        String text = label.getText().toLowerCase();
+        String text = label.getText();
+        if(!config.isCaseSensitiveMatching()){
+            text = text.toLowerCase(); //TODO use language of label for Locale
+        }
         //Tokenize the label and remove remove tokens without alpha numerical chars
         String[] unprocessedLabelTokens = content.tokenize(text);
         int offset = 0;
@@ -396,7 +399,10 @@ public class EntityLinker {
                 && search ;currentIndex++){
             currentToken = state.getSentence().getTokens().get(currentIndex);
             if(currentToken.hasAplhaNumericChar()){
-                currentTokenText = currentToken.getText().toLowerCase();
+                currentTokenText = currentToken.getText();
+                if(!config.isCaseSensitiveMatching()){
+                    currentTokenText = currentTokenText.toLowerCase();
+                }
                 currentTokenLength = currentTokenText.length();
                 boolean isProcessable = isProcessableToken(currentToken);
                 boolean found = false;
@@ -460,7 +466,10 @@ public class EntityLinker {
             String labelTokenText = labelTokens[labelIndex];
             if(labelTokenSet.remove(labelTokenText)){ //still not matched
                 currentToken = state.getSentence().getTokens().get(currentIndex);
-                currentTokenText = currentToken.getText().toLowerCase();
+                currentTokenText = currentToken.getText();
+                if(!config.isCaseSensitiveMatching()){
+                    currentTokenText = currentTokenText.toLowerCase();
+                }
                 currentTokenLength = currentTokenText.length();
                 boolean found = false;
                 float matchFactor = 0f;
@@ -503,7 +512,7 @@ public class EntityLinker {
         //   match (this will be very rare
         if(foundProcessableTokens > 0 && match.getMatchCount() <= foundProcessableTokens) {
             String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
-            if(currentText.equalsIgnoreCase(text)){ 
+            if(config.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){ 
                 labelMatch = MATCH.EXACT;
                 //set found to covered: May be lower because only
                 //processable tokens are counted, but Exact also checks

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java Fri Mar 16 05:21:22 2012
@@ -88,6 +88,10 @@ public class EntityLinkerConfig {
      */
     public static final String DEFAULT_LANGUAGE = null;
     /**
+     * The default for case sensitive matching is set to <code>false</code>
+     */
+    public static final boolean DEFAULT_CASE_SENSITIVE_MATCHING_STATE = false;
+    /**
      * Default mapping for Concept types to dc:type values added for
      * TextAnnotations.
      */
@@ -98,6 +102,7 @@ public class EntityLinkerConfig {
         mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(), OntologicalClasses.DBPEDIA_ORGANISATION);
         mappings.put(NamespaceEnum.dbpediaOnt+"Newspaper", OntologicalClasses.DBPEDIA_ORGANISATION);
         mappings.put(NamespaceEnum.schema+"Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+//        mappings.put(NamespaceEnum.dailymed+"organization",OntologicalClasses.DBPEDIA_ORGANISATION);
         
         mappings.put(OntologicalClasses.DBPEDIA_PERSON.getUnicodeString(), OntologicalClasses.DBPEDIA_PERSON);
         mappings.put(NamespaceEnum.foaf+"Person", OntologicalClasses.DBPEDIA_PERSON);
@@ -108,6 +113,25 @@ public class EntityLinkerConfig {
         mappings.put(NamespaceEnum.gml+"_Feature", OntologicalClasses.DBPEDIA_PLACE);
 
         mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(), OntologicalClasses.SKOS_CONCEPT);
+
+//        UriRef DRUG = new UriRef(NamespaceEnum.drugbank+"drugs");
+//        mappings.put(DRUG.getUnicodeString(), DRUG);
+//        mappings.put(NamespaceEnum.dbpediaOnt+"Drug", DRUG);
+//        mappings.put(NamespaceEnum.dailymed+"drugs", DRUG);
+//        mappings.put(NamespaceEnum.sider+"drugs", DRUG);
+//        mappings.put(NamespaceEnum.tcm+"Medicine", DRUG);
+//        
+//        UriRef DISEASE = new UriRef(NamespaceEnum.diseasome+"diseases");
+//        mappings.put(DISEASE.getUnicodeString(), DISEASE);
+//        mappings.put(NamespaceEnum.linkedct+"condition", DISEASE);
+//        mappings.put(NamespaceEnum.tcm+"Disease", DISEASE);
+//
+//        UriRef SIDE_EFFECT = new UriRef(NamespaceEnum.sider+"side_effects");
+//        mappings.put(SIDE_EFFECT.getUnicodeString(), SIDE_EFFECT);
+//        
+//        UriRef INGREDIENT = new UriRef(NamespaceEnum.dailymed+"ingredients");
+//        mappings.put(INGREDIENT.getUnicodeString(), INGREDIENT);
+                
         DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
     }
     /**
@@ -162,6 +186,8 @@ public class EntityLinkerConfig {
      * more mapped to the actual label of an result.
      */
     private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS;
+    
+    private boolean caseSensitiveMatchingState = DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
     /**
      * Holds the mappings of rdf:type used by concepts to dc:type values used
      * by TextAnnotations. 
@@ -356,6 +382,20 @@ public class EntityLinkerConfig {
         this.maxSearchTokens = maxSearchTokens;
     }
     /**
+     * Getter for the case sensitive matching state
+     * @return <code>true</code> if case sensitive matching is enabled
+     */
+    public boolean isCaseSensitiveMatching() {
+        return caseSensitiveMatchingState;
+    }
+    /**
+     * Setter for the case sensitive matching state
+     * @param state <code>true</code> to enable case sensitive matching, <code>false</code> to disable it
+     */
+    public void setCaseSensitiveMatchingState(boolean state) {
+        this.caseSensitiveMatchingState = state;
+    }
+    /**
      * Removes the mapping for the parsed concept type
      * @param conceptType the concept type to remove the mapping
      * @return the previously mapped dc:type value or <code>null</code> if

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java Fri Mar 16 05:21:22 2012
@@ -212,10 +212,12 @@ public class Suggestion implements Compa
      * @return the best match or {@link Suggestion#getMatchedLabel()} if non is found
      */
     public Text getBestLabel(String nameField, String language){
-        Representation rep = getRepresentation(); 
+        Representation rep = getRepresentation();
+        //start with the matched label -> so if we do not find a better one
+        //we will use the matched!
+        Text label = this.label;
         // 1. check if the returned Entity does has a label -> if not return null
         // add labels (set only a single label. Use "en" if available!
-        Text label = null;
         Iterator<Text> labels = rep.getText(nameField);
         boolean matchFound = false;
         while (labels.hasNext() && !matchFound) {

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Mar 16 05:21:22 2012
@@ -56,6 +56,10 @@ org.apache.stanbol.enhancer.engines.keyw
 org.apache.stanbol.enhancer.engines.keywordextraction.typeField.description=The field used to \
 retrieve the types of matched Entities. Values of that field are expected to be URIs
 
+org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive.name=Case Sensitivity
+org.apache.stanbol.enhancer.engines.keywordextraction.caseSensitive.description=Allows to enable/disable \
+case sensitive matching
+
 org.apache.stanbol.enhancer.engines.keywordextraction.redirectField.name=Redirect Field
 org.apache.stanbol.enhancer.engines.keywordextraction.redirectField.description=Entities may \
 define redirects to other Entities (e.g. "USA"(http://dbpedia.org/resource/USA) -> \
@@ -97,3 +101,17 @@ configuration (e.g. to 'en' in the case 
 org.apache.stanbol.enhancer.engines.keywordextraction.dereference.name=Dereference Entities
 org.apache.stanbol.enhancer.engines.keywordextraction.dereference.description=If enabled additional \
 data for suggested Entities are included.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings.name=Type Mappings
+org.apache.stanbol.enhancer.engines.keywordextraction.typeMappings.description=This allows to add \
+additional entity-type > text-annotation-type mappings. Such mappings are used to determine the \
+'dc:type' value of the 'fise:TextAnnotation' created for extracted entities. Usage: \
+variant (a) '{uri}' short for {uri} > {uri} or (b) '{source1};{source2};..;{sourceN} > {target}'. \
+Note that a {source} may be only mapped to a single {target}. Multiple {source} types \
+can be mapped to the same {target}.
+
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer.name=Keyword Tokenizer
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer.description=This allows \
+to use a special Tokenizer for matching keywords and alpha numeric IDs. Typical language \
+specific Tokenizers tend to split such IDs in several tokens and therefore might prevent \
+a correct matching.

Modified: incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java (original)
+++ incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java Fri Mar 16 05:21:22 2012
@@ -47,10 +47,12 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.EnumMap;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Set;
 import java.util.TreeMap;
 
 import javax.servlet.ServletContext;
@@ -80,6 +82,7 @@ import org.apache.clerezza.rdf.core.seri
 import org.apache.clerezza.rdf.core.sparql.ParseException;
 import org.apache.clerezza.rdf.ontologies.RDF;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.commons.indexedgraph.IndexedMGraph;
 import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
@@ -232,7 +235,40 @@ public class ContentItemResource extends
     public URI getMetadataHref() {
         return metadataHref;
     }
-
+    /**
+     * Checks if there are Occurrences
+     */
+    public boolean hasOccurrences(){
+        for(Map<String,EntityExtractionSummary> occ : extractionsByTypeMap.values()){
+            if(!occ.isEmpty()){
+                return true;
+            }
+        }
+        return false;
+    }
+    /**
+     * Used to print occurrences with types other than the natively supported ones
+     */
+    public Collection<UriRef> getOtherOccurrencyTypes(){
+        Set<UriRef>  types = new HashSet<UriRef>(extractionsByTypeMap.keySet());
+        types.remove(DBPEDIA_PERSON);
+        types.remove(DBPEDIA_ORGANISATION);
+        types.remove(DBPEDIA_PLACE);
+        types.remove(SKOS_CONCEPT);
+        types.remove(null); //other
+        return types;
+    }
+    public String extractLabel(UriRef uri){
+        String fullUri = uri.getUnicodeString();
+        int index = Math.max(fullUri.lastIndexOf('#'),fullUri.lastIndexOf('/'));
+        index = Math.max(index, fullUri.lastIndexOf(':'));
+        //do not convert if the parsed uri does not contain a local name
+        if(index > 0 && index+1 < fullUri.length()){
+            return StringUtils.capitalize(fullUri.substring(index+1).replaceAll("[\\-_]", " "));
+        } else {
+            return uri.getUnicodeString();
+        }
+    }
     public Collection<EntityExtractionSummary> getOccurrences(UriRef type){
         Map<String,EntityExtractionSummary> typeMap = extractionsByTypeMap.get(type);
         Collection<EntityExtractionSummary> typeOccurrences;
@@ -401,13 +437,17 @@ public class ContentItemResource extends
 
         public String getThumbnailSrc() {
             if (suggestions.isEmpty()) {
-                return defaultThumbnails.get(type);
+                return getMissingThumbnailSrc();
             }
             return suggestions.get(0).getThumbnailSrc();
         }
 
         public String getMissingThumbnailSrc() {
-            return defaultThumbnails.get(type);
+            String source = defaultThumbnails.get(type);
+            if(source == null){
+                source = defaultThumbnails.get(null);//default
+            }
+            return source;
         }
 
         public EntitySuggestion getBestGuess() {
@@ -512,11 +552,15 @@ public class ContentItemResource extends
                     return ((UriRef) object).getUnicodeString();
                 }
             }
-            return defaultThumbnails.get(type);
+            return getMissingThumbnailSrc();
         }
 
         public String getMissingThumbnailSrc() {
-            return defaultThumbnails.get(type);
+            String source = defaultThumbnails.get(type);
+            if(source == null){
+                source = defaultThumbnails.get(null);
+            }
+            return source;
         }
 
         public String getSummary() {

Modified: incubator/stanbol/trunk/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl (original)
+++ incubator/stanbol/trunk/enhancer/jersey/src/main/resources/org/apache/stanbol/enhancer/jersey/templates/imports/contentitem.ftl Fri Mar 16 05:21:22 2012
@@ -18,10 +18,10 @@
 <#macro view>
 
 <div class="entitylistings">
-<#if it.personOccurrences?size != 0 || it.organizationOccurrences?size != 0 ||  it.placeOccurrences?size != 0 || it.conceptOccurrences?size != 0 || it.otherOccurrences?size != 0>
+<#if it.hasOccurrences()>
 <h3>Extracted entities</h3>
 </#if>
-
+<#-- First print the predefined types -->
 <div class="entitylisting">
 <#if it.personOccurrences?size != 0>
 <h3>People</h3>
@@ -50,6 +50,15 @@
 </#if>
 </div>
 
+<#-- add Occurrences with other types -->
+<#list it.otherOccurrencyTypes as type>
+  <div class="entitylisting">
+  <h3>${it.extractLabel(type)}</h3>
+  <@entities.listing entities=it.getOccurrences(type) />
+  </div>
+</#list>
+
+<#-- add Occurrences with no type -->
 <div class="entitylisting">
 <#if it.otherOccurrences?size != 0>
 <h3>Others</h3>

Modified: incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java (original)
+++ incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java Fri Mar 16 05:21:22 2012
@@ -79,7 +79,47 @@ public enum NamespaceEnum {
     /**
      * The W3C Ontology for Media Resources http://www.w3.org/TR/mediaont-10/
      */
-    media("http://www.w3.org/ns/ma-ont#")
+    media("http://www.w3.org/ns/ma-ont#"),
+    /*
+     * eHealth domain 
+     */
+    /**
+     * DrugBank is a repository of almost 5000 FDA-approved small molecule and 
+     * biotech drugs. 
+     */
+    drugbank("http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugbank/"),
+    /**
+     * Dailymed is published by the National Library of Medicine, 
+     * and provides high quality information about marketed drugs.
+     */
+    dailymed("http://www4.wiwiss.fu-berlin.de/dailymed/resource/dailymed/"),
+    /**
+     * SIDER contains information on marketed drugs and their adverse effects. 
+     * The information is extracted from public documents and package inserts.
+     */
+    sider("http://www4.wiwiss.fu-berlin.de/sider/resource/sider/"),
+    /**
+     * The Linked Clinical Trials (LinkedCT) project aims at publishing the 
+     * first open Semantic Web data source for clinical trials data.
+     */
+    linkedct("http://data.linkedct.org/resource/linkedct/"),
+    /**
+     * STITCH contains information on chemicals and proteins as well as their 
+     * interactions and links.
+     */
+    stitch("http://www4.wiwiss.fu-berlin.de/stitch/resource/stitch/"),
+    /**
+     * Diseasome publishes a network of 4,300 disorders and disease genes linked 
+     * by known disorder-gene associations for exploring all known phenotype and 
+     * disease gene associations, indicating the common genetic origin of many 
+     * diseases.
+     */
+    diseasome("http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseasome/"),
+    /**
+     * National Cancer Institute Thesaurus (http://www.mindswap.org/2003/CancerOntology/)
+     */
+    nci("http://www.mindswap.org/2003/nciOncology.owl#"),
+    tcm("http://purl.org/net/tcm/tcm.lifescience.ntu.edu.tw/")
     ;
     /**
      * The logger

Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java?rev=1301339&r1=1301338&r2=1301339&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java (original)
+++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/query/QueryUtils.java Fri Mar 16 05:21:22 2012
@@ -41,7 +41,22 @@ public final class QueryUtils {
      * of STR no whitespace is assumed. Therefore spaces need to be replaced with '+' to search for tokens
      * with the exact name. In all other cases the string need not to be converted.
      * 
-     * Note also that text queries are converted to lower case
+     * <del>Note also that text queries are converted to lower case</del>
+     * Note: since 2012-03-14 only wildcard queries are converted to lower case.
+     * <p>
+     * <b>TODO:</b> Until Solr 3.6 is released and the implementation of
+     * <a href="https://issues.apache.org/jira/browse/SOLR-2438">SOLR-2438</a> is
+     * released this needs to still convert wildcard queries to lower case.<br>
+     * Because of that:<ul>
+     * <li> in case <code>escape=true</code>. Non-wildcard queries should support
+     * case sensitive searches. If the searched solr field uses a lowerCase
+     * filter then this will be done by Solr anyway and if not then case
+     * sensitivity might be important!
+     * <li> for <code>escape=false</code> - wild card searches the values are
+     * still converted to lower case to keep compatible with previous versions.
+     * TODO: the caseSensitive parameter of TextConstraints should be used
+     * instead
+     * </ul>
      * 
      * @param value
      *            the index value
@@ -62,13 +77,17 @@ public final class QueryUtils {
             value = SolrUtil.escapeWildCardString(value);
         }
         if (IndexDataTypeEnum.TXT.getIndexType().equals(indexValue.getType())) {
-        	value = value.toLowerCase();
+            if(!escape){ 
+                value = value.toLowerCase();
+            } //rw: 20120314: respect case sensitivity for escaped (non wildcard)
             Collection<String> tokens = new HashSet<String>(
                     Arrays.asList(value.split(" ")));
             tokens.remove("");
             queryConstraints = tokens.toArray(new String[tokens.size()]);
         } else if (IndexDataTypeEnum.STR.getIndexType().equals(indexValue.getType())) {
-            value = value.toLowerCase();
+            if(!escape){ 
+                value = value.toLowerCase();
+            } //rw: 20120314: respect case sensitivity for escaped (non wildcard)
             queryConstraints = new String[] {value.replace(' ', '+')};
         } else {
             queryConstraints = new String[] {value};