You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/09/19 10:48:39 UTC
svn commit: r1387488 [3/5] - in /incubator/stanbol/branches/stanbol-nlp-processing: ./ data/ data/bundlelists/sentiment/ data/bundlelists/sentiment/src/ data/bundlelists/sentiment/src/main/ data/bundlelists/sentiment/src/main/bundles/ data/opennlp/lang...

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.sentiment.classifiers;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.lucene.analysis.en.EnglishMinimalStemmer;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
+import org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
+import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A word classifier for the english language based on SentiWordNet. Reads in a SentiWordNet file and
+ * represents mappings from word to sentiment score between -1 and 1 in a hashmap.
+ * <p/>
+ * Future versions might make use of a disk-based storage of the hashmap to improve memory performance.
+ * <p/>
+ * Note that a license for SentiWordNet is required if you intend to use the classifier in commercial
+ * settings.
+ * <p/>
+ * @author Sebastian Schaffert
+ */
+@Component(immediate = true)
+public class SentiWordNet {
+
+    private static final Map<String,String> modelProperties = new HashMap<String,String>();
+    static { // metadata handed to the DataFileTracker together with the tracked resource name
+        modelProperties.put("Description", "Sentiment Word List (English)"); // this classifier serves "en"
+        modelProperties.put("Download Location", "http://wordnet.princeton.edu/");
+    }
+    private static Logger log = LoggerFactory.getLogger(SentiWordNet.class);
+
+    private static final String SENTIWORDNET_RESOURCE = "SentiWordNet_3.0.0_20120206.txt";
+
+    protected String sentiWordNetFile;
+    
+    private ModelListener modelListener = new ModelListener();
+    
+    @Reference
+    private DataFileTracker dataFileTracker;
+
+    private BundleContext bundleContext;
+
+    protected SentiWordNetClassifierEN classifier;
+    
+    protected ServiceRegistration classifierRegistration;
+    
+    public SentiWordNet() {}
+    
+    @Activate
+    protected void activate(ComponentContext ctx){
+        bundleContext = ctx.getBundleContext();
+        //TODO: make configurable
+        sentiWordNetFile = SENTIWORDNET_RESOURCE;
+        
+        classifier = new SentiWordNetClassifierEN();
+
+        dataFileTracker.add(modelListener, sentiWordNetFile, modelProperties);
+    }
+
+    @Deactivate
+    protected void deactivate(ComponentContext ctx){
+        dataFileTracker.removeAll(modelListener); //stop tracking first: no new callback may register the service while shutting down
+        if(classifierRegistration != null){
+            classifierRegistration.unregister();
+            classifierRegistration = null;
+        }
+        if(classifier != null){
+            classifier.close(); //clears the loaded word map
+            classifier = null;
+        }
+        sentiWordNetFile = null;
+    }
+    
+    /**
+     * Tracks the SentiWordNet file and triggers the registration of the service
+     */
+    private class ModelListener implements DataFileListener {
+
+        @Override
+        public boolean available(String resourceName, InputStream is) {
+            if(sentiWordNetFile.equals(resourceName)){
+                log.info("{} resource available",resourceName);
+                try {
+                    long start = System.currentTimeMillis();
+                    if(classifier != null){
+                        classifier.parseSentiWordNet(is);
+                        log.info("   ... loaded in {} ms",(System.currentTimeMillis()-start));
+                        registerService(); //register the service
+                    }
+                } catch (IOException e) {
+                    log.warn("Unable to load '"+resourceName+"'!",e);
+                    return false; //keep tracking
+                } catch (RuntimeException e) {
+                    log.error("RuntimeException while loading '"
+                            +resourceName+"!",e);
+                    return false; //keep tracking
+                }
+            } else {
+                log.warn("Tracker notified event for non-tracked resource '{}'"
+                    + "(tracked: {})!",resourceName,sentiWordNetFile);
+            }
+            //remove registration
+            return true;
+        }
+
+        @Override
+        public boolean unavailable(String resourceName) {
+            //not used;
+            return false;
+        }
+        
+    }
+    
+    protected void registerService() {
+        Dictionary<String,Object> serviceProperties = new Hashtable<String,Object>();
+        serviceProperties.put("language", "en"); //set the language
+        BundleContext bc = bundleContext;
+        if(bc != null){
+            classifierRegistration = bc.registerService(
+                SentimentClassifier.class.getName(), classifier, 
+                serviceProperties);
+        }
+    }
+    /**
+     * The OSGI service registered as soon as the required DataFiles are
+     * available
+     */
+    public static class SentiWordNetClassifierEN extends LexicalCategoryClassifier implements SentimentClassifier {
+
+        private ReadWriteLock lock = new ReentrantReadWriteLock();
+        private Map<String,Double> wordMap = new TreeMap<String,Double>();
+
+        private EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();
+
+        protected SentiWordNetClassifierEN() {}
+
+        /**
+         * Fills the word map from a SentiWordNet file; malformed lines are logged and skipped, the reader is closed when parsing ends.
+         */
+        protected void parseSentiWordNet(InputStream is) throws IOException {
+            // decode explicitly as UTF-8 instead of relying on the platform default charset
+            BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+            lock.writeLock().lock();
+            try {
+                // read line by line:
+                // - lines starting with # are ignored
+                // - valid lines have the format POS ID POSSCORE NEGSCORE SYNONYMS GLOSS separated by tabs
+                for (String line = in.readLine(); line != null; line = in.readLine()) {
+                    line = line.trim();
+                    if (line.length() > 0 && line.charAt(0) != '#') {
+                        String[] components = line.split("\t");
+                        try {
+                            double posScore = Double.parseDouble(components[2]);
+                            double negScore = Double.parseDouble(components[3]);
+                            String synonyms = components[4];
+                            double score = posScore - negScore;
+                            if (score != 0.0) { // neutral entries are not stored
+                                for (String synonymToken : synonyms.split(" ")) {
+                                    // synonymTokens are of the form word#position, so we strip out the position
+                                    // part
+                                    String[] synonym = synonymToken.split("#");
+                                    wordMap.put(getStemmed(synonym[0]), score);
+                                }
+                            }
+                        } catch (Exception ex) {
+                            log.warn("could not parse SentiWordNet line '{}': {}", line, ex.getMessage());
+                        }
+                    }
+                }
+            } finally {
+                lock.writeLock().unlock();
+                IOUtils.closeQuietly(in);
+            }
+        }
+
+        public int getWordCount() {
+            lock.readLock().lock();
+            try {
+                return wordMap.size();
+            } finally {
+                lock.readLock().unlock();
+            }
+        }
+
+        /**
+         * Given the word passed as argument, return a value between -1 and 1 indicating its sentiment value
+         * from very negative to very positive. Unknown words should return the value 0.
+         * 
+         * @param word
+         * @return
+         */
+        @Override
+        public double classifyWord(String word) {
+            String stemmed = getStemmed(word);
+            lock.readLock().lock();
+            try {
+                Double sentiment = wordMap.get(stemmed);
+                return sentiment != null ? sentiment.doubleValue() : 0.0;
+            } finally {
+                lock.readLock().unlock();
+            }
+        }
+
+        private String getStemmed(String word) {
+            return word.substring(0, stemmer.stem(word.toCharArray(), word.length()));
+        }
+
+        @Override
+        public String getLanguage() {
+            return "en";
+        }
+        
+        protected void close(){
+            lock.writeLock().lock();
+            try {
+                wordMap.clear();
+            } finally {
+                lock.writeLock().unlock();
+            }
+        }
+    }
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.sentiment.services;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.sentimentAnnotation;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
+import org.apache.felix.scr.annotations.ReferenceStrategy;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.messages.NLS;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import org.apache.stanbol.enhancer.engines.sentiment.classifiers.SentiWSComponent;
+import org.apache.stanbol.enhancer.engines.sentiment.classifiers.SentiWordNet;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.sentiment.SentimentTag;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * A Stanbol engine that associates sentiment values with the tokens created by the POS tagging engine.
+ * Sentiment values are added to the POSContentPart of the content item and can be further analysed by other
+ * engines, e.g. to compute sentiment values for the whole content item or in relation to certain nouns.
+ * <p/>
+ * The configuration allows specifying whether to analyse all words or only adjectives and nouns (a typical case).
+ * <p/>
+ * Currently, sentiment analysis is available for English and for German language. It uses the following word lists:
+ * <ul>
+ *     <li>English: SentiWordNet (http://wordnet.princeton.edu/), license allows commercial use</li>
+ *     <li>German: SentiWS (http://wortschatz.informatik.uni-leipzig.de/download/), license does NOT allow commercial use</li>
+ * </ul>
+ * <p/>
+ * Author: Sebastian Schaffert
+ */
+@Component(immediate = true, metatype = true, configurationFactory = true, policy = ConfigurationPolicy.REQUIRE)
+@Service
+@Properties(value={
+        @Property(name= EnhancementEngine.PROPERTY_NAME,value="sentiment")
+})
+
+public class SentimentEngine  extends AbstractEnhancementEngine<RuntimeException,RuntimeException> {
+
+    /**
+     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
+     * are the languages given as default value.
+     */
+    @Property(value={SentimentEngine.DEFAULT_LANGUAGE_CONFIG})
+    public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentiment.languages";
+
+    /**
+     * When set to true, only adjectives and nouns will be considered in sentiment analysis.
+     */
+    @Property(boolValue = SentimentEngine.DEFAULT_PROCESS_ADJECTIVES_ONLY )
+    public static final String CONFIG_ADJECTIVES = "org.apache.stanbol.enhancer.sentiment.adjectives";
+    /**
+     * POS tags that are not selected by {@link SentimentClassifier#isAdjective(PosTag)}
+     * or {@link SentimentClassifier#isNoun(PosTag)} are ignored if their confidence
+     * is &gt;= the configured value. If there are multiple POS tag suggestions,
+     * words that do have a suitable tag are still considered if the
+     * confidence of the fitting tag is &gt;= {min-pos-confidence}/2
+     */
+    @Property(doubleValue = SentimentEngine.DEFAULT_MIN_POS_CONFIDNECE)
+    public static final String CONFIG_MIN_POS_CONFIDENCE = "org.apache.stanbol.enhancer.sentiment.min-pos-confidence";
+
+    @Property(boolValue=true)
+    public static final String DEBUG_SENTIMENTS = "debug";
+    boolean debugSentiments;
+    
+    public static final String DEFAULT_LANGUAGE_CONFIG = "*";
+    private LanguageConfiguration langaugeConfig = 
+            new LanguageConfiguration(CONFIG_LANGUAGES, new String[]{DEFAULT_LANGUAGE_CONFIG});
+
+    /**
+     * The minimum confidence of POS tags so that a token is NOT processed if
+     * the {@link LexicalCategory} is NOT {@link LexicalCategory#Adjective} (or
+     * {@link LexicalCategory#Noun Noun} if {@link #CONFIG_ADJECTIVES} is
+     * deactivated) - default: 0.8<p>
+     */
+    private static final double DEFAULT_MIN_POS_CONFIDNECE = 0.8;
+
+    private static final boolean DEFAULT_PROCESS_ADJECTIVES_ONLY = false;
+
+
+    private static Logger log = LoggerFactory.getLogger(SentimentEngine.class);
+
+    /**
+     * {@link SentimentClassifier} are now OSGI services and injected via events
+     * (calls to {@link #bindClassifier(SentimentClassifier)} and 
+     * {@link #unbindClassifier(SentimentClassifier)}) as soon as they become
+     * (un)available.
+     */
+    @Reference(referenceInterface=SentimentClassifier.class,
+        cardinality=ReferenceCardinality.OPTIONAL_MULTIPLE,
+        bind="bindClassifier",
+        unbind="unbindClassifier",
+        policy=ReferencePolicy.DYNAMIC,
+        strategy=ReferenceStrategy.EVENT)
+    private Map<String,SentimentClassifier> classifiers = Collections.synchronizedMap(
+        new HashMap<String,SentimentClassifier>());
+    /** bind method for {@link #classifiers} */
+    protected void bindClassifier(SentimentClassifier classifier){
+        log.info("  ... bind Sentiment Classifier {} for language {}",
+            classifier.getClass().getSimpleName(),classifier.getLanguage());
+        SentimentClassifier old = classifiers.put(classifier.getLanguage(), classifier); //one classifier per language
+        if(old != null){
+            log.warn("Replaced Sentiment Classifier for language {} (old: {}, new: {})",
+                new Object[]{old.getLanguage(),old,classifier});
+        }
+    }
+    /** unbind method for {@link #classifiers} */
+    protected void unbindClassifier(SentimentClassifier classifier){
+        String lang = classifier.getLanguage();
+        synchronized (classifiers) {
+            SentimentClassifier current = classifiers.remove(lang);
+            if(!classifier.equals(current) //the current is not the parsed one
+                    && current != null){
+                classifiers.put(lang,current); //re-add the value
+            } else {
+                log.info("  ... unbind Sentiment Classifier {} for language {}",
+                    classifier.getClass().getSimpleName(),lang);
+            }
+        }
+    }
+    
+    /**
+     * The processed {@link LexicalCategory LexicalCategories}.
+     */
+    boolean adjectivesOnly = DEFAULT_PROCESS_ADJECTIVES_ONLY;
+    
+    /**
+     * The minimum {@link PosTag} value {@link Value#probability() confidence}.<p>
+     * This means that if the {@link Value#probability() confidence} of a
+     * {@link NlpAnnotations#POSAnnotation}s (returned by
+     * {@link Token#getAnnotations(Annotation)}) is greater than 
+     * {@link #minPOSConfidence} that the result of 
+     * {@link SentimentClassifier#isAdjective(PosTag)} (and 
+     * {@link SentimentClassifier#isNoun(PosTag)}  - if #CONFIG_ADJECTIVES is 
+     * deactivated) is used to decide if a Token needs to be processed or not.
+     * Otherwise further {@link NlpAnnotations#POSAnnotation}s are analysed for
+     * processable POS tags. Processable POS tags are accepted until
+     * <code>{@link #minPOSConfidence}/2</code>.  
+     */
+    private double minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE;
+
+    /**
+     * Indicate if this engine can enhance supplied ContentItem, and if it
+     * suggests enhancing it synchronously or asynchronously. The
+     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
+     * just a suggestion from the engine.
+     * <p/>
+     * Returns {@link EnhancementEngine}#ENHANCE_ASYNC if <ul>
+     * <li> the {@link AnalysedText} content part is present
+     * <li> the language of the content is known
+     * <li> the language is active based on the language configuration and
+     * <li> a sentiment classifier is available for the language
+     * </ul>
+     *
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *          if the introspecting process of the content item
+     *          fails
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        if(getAnalysedText(this,ci, false) == null){
+            return CANNOT_ENHANCE; //no AnalysedText content part
+        }
+        String language = getLanguage(this, ci,false);
+        //the language must be known AND enabled by the configured languages
+        if(language == null || !langaugeConfig.isLanguage(language)) {
+            return CANNOT_ENHANCE;
+        }
+        if(classifiers.containsKey(language)){
+            return ENHANCE_ASYNC;
+        } else {
+            return CANNOT_ENHANCE; //no classifier bound for this language
+        }
+    }
+
+
+    /**
+     * Compute enhancements for supplied ContentItem. The results of the process
+     * are expected to be stored in the metadata of the content item.
+     * <p/>
+     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
+     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
+     *
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *          if the underlying process failed to work as
+     *          expected
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        AnalysedText analysedText = getAnalysedText(this,ci, true);
+        String language = getLanguage(this, ci, true);
+        SentimentClassifier classifier = classifiers.get(language);
+        if(classifier == null){
+            throw new IllegalStateException("Sentiment Classifier for language '"
+                + language +"' not available. As this is also checked in "
+                + " canEnhance this may indicate an Bug in the used "
+                + "EnhancementJobManager!");
+        }
+        //TODO: locking for AnalysedText not yet defined
+//        ci.getLock().writeLock().lock();
+//        try {
+        Iterator<Token> tokens = analysedText.getTokens();
+        while(tokens.hasNext()){
+            Token token = tokens.next();
+            boolean process = !adjectivesOnly;
+            if(!process){ //check POS types
+                Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POSAnnotation).iterator();
+                boolean ignore = false;
+                while(!ignore && !process && posTags.hasNext()) {
+                    Value<PosTag> value = posTags.next();
+                    PosTag tag = value.value();
+                    boolean state = classifier.isAdjective(tag) || classifier.isNoun(tag);
+                    ignore = !state && value.probability() >= minPOSConfidence;
+                    process = state && value.probability() >= (minPOSConfidence/2.0);
+                }
+            } //else process all tokens ... no POS tag checking needed
+            if(process){
+                double sentiment = classifier.classifyWord(token.getSpan());
+                if(sentiment != 0.0){
+                    token.addAnnotation(sentimentAnnotation, 
+                        new Value<SentimentTag>(sentiment > 0 ? 
+                                SentimentTag.POSITIVE : SentimentTag.NEGATIVE,
+                                Math.abs(sentiment)));
+                } //else do not set sentiments with 0.0
+            }
+        }
+//        } finally {
+//            ci.getLock().writeLock().unlock();
+//        }
+//        if(debugSentiments){
+//            Iterator<Sentence> sentences = analysedText.getSentences();
+//            if(sentences.hasNext()){
+//                while(sentences.hasNext()){
+//                    Sentence sent = sentences.next();
+//                    log.info("Sentence: {}", sent.getSpan());
+//                    tokens = sent.getTokens();
+//                    double positive = 0.0;
+//                    double negaitve = 0.0;
+//                    while (tokens.hasNext()){
+//                        Token token = tokens.next();
+//                        Value<SentimentTag> sentiment = token.getAnnotation(NlpAnnotations.sentimentAnnotation);
+//                        if(sentiment != null){
+//                            if(sentiment.value().isPositive()){
+//                                positive = positive+sentiment.probability();
+//                            } else {
+//                                negaitve = negaitve+sentiment.probability();
+//                            }
+//                            Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POSAnnotation);
+//                            log.info("   - {} '{}'[{}] - value: {}",
+//                                new Object []{
+//                                sentiment.value().isPositive()?"positive":"negative",
+//                                token.getSpan(),
+//                                posTag != null ? posTag.value(): "POS unknown",
+//                                sentiment.probability()
+//                                });
+//                        }
+//                    }
+//                    log.info(" > positive: {} | negative: {} | sum: {}",
+//                        new Object []{positive, negaitve, (positive - negaitve)});
+//                }
+//            } else {
+//                
+//            }
+//        }
+    }
+
+
+    /**
+     * Activate and read the properties: the language configuration (CONFIG_LANGUAGES), the adjectives/nouns
+     * only switch (CONFIG_ADJECTIVES), the minimum POS confidence (CONFIG_MIN_POS_CONFIDENCE) and the debug flag.
+     *
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException {
+        log.info("activating sentiment analysis engine");
+        super.activate(ce);
+        @SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ce.getProperties();
+
+        //parse the configured languages
+        langaugeConfig.setConfiguration(properties);
+        
+        //set the processed lexical categories
+        Object value = properties.get(CONFIG_ADJECTIVES);
+        adjectivesOnly = value instanceof Boolean ? (Boolean)value :
+            value != null ? Boolean.parseBoolean(value.toString()) : 
+                DEFAULT_PROCESS_ADJECTIVES_ONLY;
+        
+        //set minimum POS confidence
+        value = properties.get(CONFIG_MIN_POS_CONFIDENCE);
+        if(value instanceof Number){
+            minPOSConfidence = ((Number)value).doubleValue();
+        } else if(value != null){
+            try {
+                minPOSConfidence = Double.parseDouble(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE, 
+                    "Unable to parsed minimum POS confidence value from '"
+                    + value +"'!",e);
+            }
+        } else {
+            minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE;
+        }
+        if(!(minPOSConfidence > 0 && minPOSConfidence < 1)){ //negated form also rejects NaN
+            throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE, 
+                "The configured minimum POS confidence value '"
+                +minPOSConfidence+"' MUST BE > 0 and < 1!");
+        }
+        
+        //TODO: just for testing
+        value = properties.get(DEBUG_SENTIMENTS);
+        debugSentiments = value instanceof Boolean ? (Boolean)value :
+            value != null ? Boolean.parseBoolean(value.toString()) : 
+                false;
+    }
+    
+    @Deactivate
+    protected void deactivate(ComponentContext ctx){
+        //remove remaining classifiers
+        this.classifiers.clear();
+        langaugeConfig.setDefault();
+        super.deactivate(ctx);
+    }
+
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Sep 19 08:48:32 2012
@@ -0,0 +1,47 @@
+#
+# Copyright 2012, FORMCEPT [http://www.formcept.com]
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.stanbol.enhancer.engines.sentiment.services.SentimentEngine.name=Apache Stanbol Enhancer Engine: Sentiment Analysis
+
+stanbol.enhancer.engine.name.name=Sentiment Analysis Engine
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+#====================================================
+#Properties used to configure FORMCEPT Enhancer
+#====================================================
+
+
+org.apache.stanbol.enhancer.sentiment.languages.name=Language configuration
+org.apache.stanbol.enhancer.sentiment.languages.description=Takes a list of ISO \
+  language codes of supported languages. Currently supported are the languages given as default value.
+
+org.apache.stanbol.enhancer.sentiment.adjectives.name=Adjectives/Nouns only
+org.apache.stanbol.enhancer.sentiment.adjectives.description=When set to true, only adjectives and nouns \
+  will be considered in sentiment analysis. Note that this will cause this engine to tag words only \
+  if POS tags are available.
+  
+org.apache.stanbol.enhancer.sentiment.min-pos-confidence.name=Minimum POS Tag Confidence
+org.apache.stanbol.enhancer.sentiment.min-pos-confidence.description=If "Adjectives/Nouns only" \
+  is activated this is used as minimum confidence for POS tags. All tokens tagged as neither \
+  Noun nor Adjective with a confidence >= the configured value are filtered. \
+  NOTE that for words with ambiguous POS tags (multiple POS tags and no tag with \
+  a confidence >= the configured value) POS tags representing a Noun or Adjective \
+  are also considered if their confidence is >= half of the configured value. 
+  

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config Wed Sep 19 08:48:32 2012
@@ -0,0 +1,2 @@
+stanbol.enhancer.chain.name="person-enhancer"
+stanbol.enhancer.chain.list.enginelist=["tika","langid","snml-person-enhancer"]

Propchange: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 08:48:32 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+.classpath
+
+target

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,125 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  You under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+    <version>0.10.0-incubating-SNAPSHOT</version>
+    <relativePath>../../parent</relativePath>
+  </parent>
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+  <version>0.10.0-incubating-SNAPSHOT</version>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Enhancer NLP</name>
+  <description>
+        Module that defines the ContentPart defining the NLP processing metadata.
+    </description>
+  <inceptionYear>2012</inceptionYear>
+
+  <scm>
+    <connection>
+            scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/generic/nlp/
+        </connection>
+    <developerConnection>
+            scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/generic/nlp/
+        </developerConnection>
+    <url>http://incubator.apache.org/stanbol/</url>
+  </scm>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <!-- Enable this for including your enhancement chain configuration -->
+            <!-- TODO: maybe include POS AnnotationModel definitions -->
+            <!-- <Install-Path>config</Install-Path> -->
+            <Export-Package>
+              org.apache.stanbol.enhancer.nlp;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.model;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.model.annotation;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.ontology;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.pos;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.pos.*;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.phrase;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.sentiment;version=${project.version},
+              org.apache.stanbol.enhancer.nlp.utils;version=${project.version}
+            </Export-Package>
+            <Private-Package>
+              org.apache.stanbol.enhancer.nlp.model.impl.*;version=${project.version}
+            </Private-Package>
+            <Embed-Dependency></Embed-Dependency>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <exclude>src/license/THIRD-PARTY.properties</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-collections</groupId>
+      <artifactId>commons-collections</artifactId>
+      <version>3.2.1</version>
+    </dependency>
+    <!-- Logging -->
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>1.6.1</version>
+    </dependency>
+
+    <!-- test dependencies -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.core</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+  </dependencies>
+
+</project>

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,38 @@
+package org.apache.stanbol.enhancer.nlp;
+
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.sentiment.SentimentTag;
+
+/**
+ * Defines the {@link Annotation} constants typically used by NLP components.<p>
+ * Each constant is created with a String name (presumably the key under
+ * which the annotation is stored - TODO confirm against {@link Annotation})
+ * and the class of the annotated value.
+ */
+public interface NlpAnnotations {
+    
+    /**
+     * The POS {@link Annotation} added by POS taggers to {@link Token}s of
+     * an {@link AnalysedText}.<p>
+     * Name: <code>stanbol.enhancer.nlp.pos</code>, value type: {@link PosTag}
+     */
+    Annotation<String,PosTag> POSAnnotation = new Annotation<String,PosTag>(
+            "stanbol.enhancer.nlp.pos", PosTag.class);
+    
+    
+    /**
+     * The Phrase {@link Annotation} added by a chunker to a group of
+     * [1..*] {@link Token}s.<p>
+     * This annotation is typically found on {@link Chunk}s.<p>
+     * Name: <code>stanbol.enhancer.nlp.phrase</code>, value type: {@link PhraseTag}
+     */
+    Annotation<String,PhraseTag> phraseAnnotation = new Annotation<String,PhraseTag>(
+            "stanbol.enhancer.nlp.phrase", PhraseTag.class);
+    
+    /**
+     * The Sentiment {@link Annotation} added by a sentiment tagger, typically
+     * to single {@link Token}s that carry a positive or negative sentiment.<p>
+     * Name: <code>stanbol.enhancer.nlp.sentiment</code>, value type: {@link SentimentTag}
+     */
+    Annotation<String,SentimentTag> sentimentAnnotation = new Annotation<String,SentimentTag>(
+            "stanbol.enhancer.nlp.sentiment", SentimentTag.class);
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,62 @@
+package org.apache.stanbol.enhancer.nlp;
+
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+
+public abstract class Tag<T extends Tag<T>> { //lol ??!! is that how to define T
+
+    
+    protected final String tag;
+    private TagSet<T> annotationModel;
+
+    /**
+     * Creates a PosTag for the given String
+     * @param tag the tag
+     * @throws IllegalArgumentException if the parsed tag is <code>null</code>
+     * or empty.
+     */
+    public Tag(String tag){
+        if(tag == null || tag.isEmpty()){
+            throw new IllegalArgumentException("The tag MUST NOT be NULL!");
+        }
+        this.tag = tag;
+    }
+    
+    public final String getTag() {
+        return tag;
+    }
+    /**
+     * @return the annotationModel
+     */
+    public final TagSet<T> getAnnotationModel() {
+        return annotationModel;
+    }
+    /**
+     * Used by the {@link TagSet} class to assign itself to an PosTag
+     * that is {@link TagSet#addTag(PosTag) added}.
+     * @param annotationModel the annotationModel to set
+     */
+    protected final void setAnnotationModel(TagSet<T> annotationModel) {
+        this.annotationModel = annotationModel;
+    }
+    
+    @Override
+    public String toString() {
+        return String.format("{} %s ", getClass().getSimpleName(), tag);
+    }
+    
+    @Override
+    public int hashCode() {
+        return tag.hashCode();
+    }
+    
+    @Override
+    public boolean equals(Object obj) {
+        if(obj instanceof Tag && tag.equals(((Tag<?>)obj).tag)){
+            return (annotationModel == null && ((Tag<?>)obj).annotationModel == null) ||
+                    (annotationModel != null && annotationModel.equals(((Tag<?>)obj).annotationModel));
+        } else {
+            return false;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,131 @@
+package org.apache.stanbol.enhancer.nlp;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+/**
+ * A TagSet used for tagging {@link Annotated} resources like {@link Token}s,
+ * {@link Chunk}s or even whole {@link Sentence}s and 
+ * {@link AnalysedText Texts}s.<p>
+ * A TagSet defines a set of {@link Tag}s and can be used for one or more
+ * {@link #getLanguages() languages}.<p>
+ * {@link TagSet} uses generics to allow the specification of more specific 
+ * TagSets e.g. for {@link PosTag} or {@link PhraseTag}s.<p>
+ * @param <T> the type of the {@link Tag}s managed by this TagSet
+ */
+public class TagSet<T extends Tag<T>> implements Iterable<T>{
+
+
+    private final String name;
+    private final Set<String> languages;
+    
+    /** The tags of this TagSet with the {@link Tag#getTag() tag value} as key */
+    private final Map<String,T> tags = new HashMap<String,T>();
+    
+    private final Map<String,Object> properties = new HashMap<String,Object>();
+    
+    /**
+     * Creates a TagSet for Tags of a specific type (e.g.
+     * {@link PosTag} or {@link PhraseTag}) that can be used for the parsed
+     * Languages.<p>
+     * In addition TagSets allow to add additional {@link #getProperties() 
+     * properties}. Those can be used to assign metadata such as the 
+     * URIs of Ontologies that define the model and the linking to the
+     * <a href="http://nlp2rdf.lod2.eu/olia/">nlp2rdf OLIA</a> annotation and
+     * linking models.<p>
+     * In the future those metadata might even be used by components to 
+     * automatically create Annotation models.<p>
+     * NOTE that the parsed name is used as unique criteria. TODO this should
+     * be evaluated. 
+     * @param name the unique name (is used for {@link #hashCode()} and 
+     * {@link #equals(Object)})
+     * @param languages the languages. <code>null</code> values are ignored
+     * @throws IllegalArgumentException if the parsed name is <code>null</code>
+     * or empty
+     */
+    public TagSet(String name, String...languages) {
+        if(name == null || name.isEmpty()){
+            throw new IllegalArgumentException("The parsed name MUST NOT be NULL nor empty!");
+        }
+        this.name = name;
+        if(languages != null && languages.length > 0){
+            Set<String> langSet = new HashSet<String>(Arrays.asList(languages));
+            langSet.remove(null); //ignore null entries of the parsed array
+            this.languages = Collections.unmodifiableSet(langSet);
+        }else {
+            this.languages = Collections.emptySet();
+        }
+    }
+    
+    /**
+     * Getter for the properties of this TagSet
+     * @return the live (modifiable) properties map of this TagSet
+     */
+    public Map<String,Object> getProperties(){
+        return properties;
+    }
+    
+    
+    /**
+     * @return the name
+     */
+    public String getName() {
+        return name;
+    }
+    
+    /**
+     * @return the languages as unmodifiable set (empty if none were parsed)
+     */
+    public Set<String> getLanguages() {
+        return languages;
+    }
+    /**
+     * Adds a Tag to this TagSet and registers this TagSet as the
+     * {@link Tag#getAnnotationModel() annotation model} of the tag. A tag 
+     * with the same {@link Tag#getTag() tag value} added earlier is replaced.
+     * <code>null</code> values are ignored.
+     * @param tag the tag to add
+     * @throws IllegalStateException if the parsed tag is already assigned
+     * to a TagSet
+     */
+    public void addTag(T tag){
+        if(tag != null){
+            //NOTE: the former "|| this.equals(tag.getAnnotationModel())" was 
+            //      dead code as it was only evaluated for a NULL model.
+            if(tag.getAnnotationModel() != null){
+                throw new IllegalStateException("Unable to add "+tag+" to "+this
+                    + " because it is already assigned to "+tag.getAnnotationModel());
+            }
+            tag.setAnnotationModel(this);
+            tags.put(tag.getTag(), tag);
+        }
+    }
+    
+    /**
+     * Getter for the Tag with the parsed tag value
+     * @param tag the tag value
+     * @return the Tag or <code>null</code> if no tag with this value was added
+     */
+    public T getTag(String tag){
+        return tags.get(tag);
+    }
+    
+    /**
+     * @return iterator over all tags added to this TagSet
+     */
+    @Override
+    public Iterator<T> iterator() {
+        return tags.values().iterator();
+    }
+    
+    @Override
+    public String toString() {
+        return String.format("AnnotationModel [name: %s |languages: %s]", 
+            getName(), languages);
+    }
+    
+    @Override
+    public int hashCode() {
+        return name.hashCode();
+    }
+    
+    /**
+     * TagSets are considered equal if they have the same name
+     */
+    @Override
+    public boolean equals(Object obj) {
+        return obj instanceof TagSet && name.equals(((TagSet<?>)obj).name);
+    }
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,80 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+
+/**
+ * Provides access to NLP processing results of the <code>text/plain</code>
+ * {@link Blob} of a {@link ContentItem}. Intended to be
+ * {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef, Object) added
+ * as ContentPart} by using {@link #ANALYSED_TEXT_URI}.<p>
+ * In addition to the methods inherited from {@link Section} this interface
+ * allows to add and to iterate over {@link Sentence}s and {@link Chunk}s.
+ * @see ContentItem#addPart(UriRef, Object)
+ */
+public interface AnalysedText extends Section{
+
+    
+    /**
+     * The {@link UriRef} used to register the {@link AnalysedText} instance
+     * as {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef, Object) 
+     * ContentPart} to the {@link ContentItem}
+     */
+    public static final UriRef ANALYSED_TEXT_URI = new UriRef("urn:stanbol.enhancer:nlp.analysedText");
+
+    /**
+     * Returns {@link SpanTypeEnum#Text}
+     * @see Span#getType()
+     * @see SpanTypeEnum#Text
+     */
+    SpanTypeEnum getType();
+
+    /**
+     * Adds a Sentence.<p>
+     * NOTE(review): start/end are presumably character offsets within 
+     * {@link #getText()} - confirm against the implementation.
+     * @param start the start index
+     * @param end the end index
+     * @return the Sentence
+     */
+    Sentence addSentence(int start, int end);
+
+    /**
+     * Adds a Chunk.<p>
+     * NOTE(review): start/end are presumably character offsets within 
+     * {@link #getText()} - confirm against the implementation.
+     * @param start the start of the chunk
+     * @param end the end of the chunk
+     * @return the Chunk
+     */
+    Chunk addChunk(int start, int end);
+
+    /**
+     * All sentences of the Analysed texts.<p>
+     * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+     * but consider additions of Spans.
+     * @return the sentences
+     */
+    Iterator<Sentence> getSentences();
+
+    /**
+     * All Chunks of this analysed text.<p>
+     * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+     * but consider additions of Spans.
+     * @return the chunks
+     */
+    Iterator<Chunk> getChunks();
+
+    /**
+     * Getter for the text.
+     * @return the text as {@link CharSequence}
+     */
+    CharSequence getText();
+    
+    /**
+     * The analysed {@link Blob}. Typically {@link Blob#getMimeType()} will be
+     * <code>text/plain</code>.
+     * @return the analysed {@link Blob} instance.
+     */
+    Blob getBlob();
+    
+    
+}
\ No newline at end of file

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,59 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.io.IOException;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.impl.AnalysedTextFactoryImpl;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+
+/**
+ * Factory for {@link AnalysedText} instances. Implementations create the
+ * {@link AnalysedText} for a {@link Blob} and optionally register it as
+ * ContentPart with a {@link ContentItem}.<p>
+ * Outside of OSGI the default implementation can be obtained by using
+ * {@link #getDefaultInstance()}.
+ */
+public abstract class AnalysedTextFactory {
+
+    /**
+     * The default instance returned by {@link #getDefaultInstance()}. Declared
+     * final as it is assigned once and never replaced.
+     */
+    private static final AnalysedTextFactory defaultInstance = new AnalysedTextFactoryImpl();
+    
+    /**
+     * Creates an {@link AnalysedText} instance for the parsed {@link Blob}
+     * and registers itself as 
+     * {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef, Object) 
+     * ContentPart} with the {@link UriRef} {@link AnalysedText#ANALYSED_TEXT_URI}
+     * to the parsed {@link ContentItem}.<p>
+     * If a ContentPart with the given UriRef is already registered this 
+     * Method will throw an {@link IllegalStateException}.
+     * @param ci the ContentItem to register the created {@link AnalysedText} instance
+     * @param blob the analysed {@link Blob}
+     * @return the created {@link AnalysedText}
+     * @throws IllegalArgumentException if <code>null</code> is parsed as
+     * ContentItem or Blob
+     * @throws IllegalStateException if there is already a ContentPart
+     * registered for {@link AnalysedText#ANALYSED_TEXT_URI} with the parsed
+     * ContentItem.
+     * @throws IOException on any error while reading data from the parsed blob
+     */
+    public abstract AnalysedText createAnalysedText(ContentItem ci, Blob blob) throws IOException ;
+    /**
+     * Creates a AnalysedText instance for the parsed blob.<p>
+     * NOTE: This implementation does NOT register the {@link AnalysedText}
+     * as ContentPart. 
+     * @param blob the analysed Blob
+     * @return the AnalysedText
+     * @throws IllegalArgumentException if <code>null</code> is parsed as 
+     * {@link Blob}.
+     * @throws IOException on any error while reading data from the parsed blob
+     */
+    public abstract AnalysedText createAnalysedText(Blob blob) throws IOException ;
+    
+    /**
+     * Intended to be used outside of an OSGI container to obtain an
+     * instance of a {@link AnalysedTextFactory}. <p>
+     * When using this within an OSGI environment it is preferred to obtain
+     * the factory as a service (e.g. via the BundleContext, a ServiceTracker
+     * or by injection), as this allows the usage of different implementations.
+     * <p>
+     * This is hard-wired with the default implementation contained within this
+     * module.
+     * @return the default {@link AnalysedTextFactory} instance.
+     */
+    public static final AnalysedTextFactory getDefaultInstance(){
+        return defaultInstance;
+    }
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,154 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
+
+/**
+ * Static utilities for working with {@link AnalysedText} and the
+ * {@link Span}s provided by it.
+ */
+public class AnalysedTextUtils {
+
+    /**
+     * Getter for the {@link AnalysedText} content part of the parsed
+     * ContentItem. The part is looked up while holding the read lock of the
+     * ContentItem.<p>
+     * This assumes that the AnalysedText is registered by using
+     * {@link AnalysedText#ANALYSED_TEXT_URI}. Otherwise it will not find it.
+     * @param ci The {@link ContentItem}
+     * @return the {@link AnalysedText} or <code>null</code> if not present.
+     * @throws ClassCastException if a content part is registered with
+     * {@link AnalysedText#ANALYSED_TEXT_URI} but its type is not compatible
+     * to {@link AnalysedText}.
+     */
+    public static AnalysedText getAnalysedText(ContentItem ci){
+        ci.getLock().readLock().lock();
+        try {
+            return ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
+        }catch (NoSuchPartException e) {
+            //no AnalysedText registered with the ContentItem
+            return null;
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
+    }
+    
+    /**
+     * Copies the elements of the parsed iterator to a list.<p>
+     * NOTE that for a <code>null</code> or empty iterator the immutable
+     * {@link Collections#emptyList()} is returned while otherwise a new
+     * modifiable list is created.
+     * @param it the iterator
+     * @return the List with all spans of the Iterator
+     */
+    public static <T extends Span> List<T> asList(Iterator<T> it){
+        if(it == null || !it.hasNext()){
+            return Collections.emptyList();
+        } else {
+            List<T> spans = new ArrayList<T>();
+            appandToList(it, spans);
+            return spans;
+        }
+    }
+    /**
+     * Appends the elements provided by the parsed Iterator to the list.
+     * A <code>null</code> iterator is ignored.
+     * @param it the Iterator
+     * @param list the List
+     * @throws NullPointerException if the parsed List is <code>null</code>
+     * and the iterator provides at least one element
+     */
+    public static <T extends Span> void appandToList(Iterator<T> it, List<? super T> list){
+        if(it != null){
+            while(it.hasNext()){
+                list.add(it.next());
+            }
+        }
+    }
+    
+    /**
+     * Copies the elements of the parsed iterator to a {@link SortedSet}. As
+     * {@link Span} implements {@link Comparable} the Spans within the resulting
+     * set will have the same order as returned by the methods of {@link AnalysedText}
+     * @param it the iterator
+     * @return the {@link SortedSet} containing all Spans of the iterator
+     */
+    public static <T extends Span> SortedSet<T> asSet(Iterator<T> it){
+        SortedSet<T> spans = new TreeSet<T>();
+        addToSet(it, spans);
+        return spans;
+    }
+    /**
+     * Adds the Spans of the parsed Iterator to the parsed Set.
+     * A <code>null</code> iterator is ignored.
+     * @param it the Iterator
+     * @param set the set
+     * @throws NullPointerException if the parsed Set is <code>null</code>
+     * and the iterator provides at least one element
+     */
+    public static <T extends Span> void addToSet(Iterator<T> it,Set<? super T> set){
+        if(it != null){
+            while(it.hasNext()){
+                set.add(it.next());
+            }
+        }
+    }
+    /**
+     * Iterates over two levels of the Span hierarchy (e.g. all Tokens of a
+     * Sentence that are within a Chunk). The returned Iterator is a live
+     * view on the {@link AnalysedText} (being the context of the enclosing
+     * Span).<p>
+     * Usage Example
+     * <code><pre>
+     *     Sentence sentence; //The currently processed Sentence
+     *     Iterator&lt;Span&gt; tokens = AnalysedTextUtils.getSpansInSpans(
+     *         sentence,
+     *         {@link SpanTypeEnum#Chunk SpanTypeEnum.Chunk},
+     *         {@link SpanTypeEnum#Token SpanTypeEnum.Token});
+     *     while(tokens.hasNext()){
+     *         Token token = (Token)tokens.next();
+     *         // process only tokens within a chunk
+     *     }
+     * </pre></code>
+     * @param section the section to iterate over
+     * @param level1 the {@link SpanTypeEnum} for the first Level. MUST be
+     * a Type that is a {@link Section} (e.g. Chunk or Sentence).
+     * @param level2 the {@link SpanTypeEnum} of the Spans returned by the
+     * Iterator. Those are the Spans of this type enclosed by the level1 spans.
+     * @return Iterator over all level2 Spans enclosed by level1 Spans of the
+     * parsed section. Its <code>next()</code> method throws a
+     * {@link java.util.NoSuchElementException} if no more elements are
+     * available and <code>remove()</code> throws an 
+     * {@link IllegalStateException} if called before <code>next()</code>.
+     * @throws IllegalArgumentException if {@link SpanTypeEnum#Token} is parsed
+     * as <code>level1</code> span type.
+     */
+    public static Iterator<Span> getSpansInSpans(Section section, SpanTypeEnum level1, final SpanTypeEnum level2){
+        if(level1 == SpanTypeEnum.Token){
+            throw new IllegalArgumentException("The SpanType for level1 MUST refer to a Section "
+                + "(Chunk, Sentence, TextSection or Text)");
+        }
+        final Iterator<Span> level1It = section.getEnclosed(EnumSet.of(level1));
+        return new Iterator<Span>(){
+            /** Iterator over the level2 spans of the current level1 span */
+            Iterator<Span> level2It = null;
+            /** 
+             * The Iterator that returned the last element. Needed because
+             * hasNext() may already advance level2It to the next level1 span.
+             */
+            Iterator<Span> lastIt = null;
+            @Override
+            public boolean hasNext() {
+                if(level2It != null && level2It.hasNext()) {
+                    return true;
+                }
+                //advance to the next level1 span that encloses level2 spans
+                while(level1It.hasNext()){
+                    level2It = ((Section)level1It.next()).getEnclosed(EnumSet.of(level2));
+                    if(level2It.hasNext()){
+                        return true;
+                    }
+                }
+                return false;
+            }
+
+            @Override
+            public Span next() {
+                //hasNext() also positions level2It; it MUST be called here to
+                //support multiple calls to next() without calls to hasNext()
+                if(!hasNext()){
+                    //level2It might be null here: avoid a NullPointerException
+                    throw new java.util.NoSuchElementException();
+                }
+                lastIt = level2It;
+                return level2It.next();
+            }
+
+            @Override
+            public void remove() {
+                if(lastIt == null){
+                    throw new IllegalStateException("next() MUST BE called before remove()");
+                }
+                //remove from the iterator that returned the last element
+                lastIt.remove();
+            }
+        };
+    }
+    
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,13 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+
+/**
+ * A {@link Section} of the type {@link SpanTypeEnum#Chunk}. This interface
+ * does not add any methods to {@link Section}; it only narrows the contract
+ * of {@link #getType()}.
+ */
+public interface Chunk extends Section {
+
+    /**
+     * Returns {@link SpanTypeEnum#Chunk}
+     * @return always {@link SpanTypeEnum#Chunk}
+     * @see Span#getType()
+     * @see SpanTypeEnum#Chunk
+     */
+    SpanTypeEnum getType();
+    
+}
\ No newline at end of file

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,41 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * A {@link Span} that may enclose other Spans. Super type for {@link Chunk}s,
+ * {@link Sentence}s and {@link AnalysedText}.<p>
+ * Like {@link Span} this is a meta (abstract) type. Implementations of this
+ * interface SHOULD BE abstract classes.
+ */
+public interface Section extends Span {
+
+    /**
+     * Iterates over all Spans enclosed by this one that are of any of the
+     * passed types.<p>
+     * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+     * but consider additions of Spans.
+     * @param types the {@link SpanTypeEnum types} of Spans included
+     * @return sorted iterator over the selected Spans.
+     */
+    Iterator<Span> getEnclosed(Set<SpanTypeEnum> types);
+
+    /**
+     * Adds a Token relative to this Section
+     * @param start the start of the token relative to this Section
+     * @param end the end of the token (presumably also relative to this
+     * Section - to be confirmed against the implementation)
+     * @return the Token for the passed offsets
+     */
+    Token addToken(int start, int end);
+
+    /**
+     * The Tokens covered by this Section.<p>
+     * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+     * but consider additions of Spans.
+     * @return the tokens
+     */
+    Iterator<Token> getTokens();
+
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,32 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+
+/**
+ * A {@link Section} of the type {@link SpanTypeEnum#Sentence}. In addition to
+ * the methods defined by {@link Section} a Sentence allows to add and to
+ * iterate over {@link Chunk}s.
+ */
+public interface Sentence extends Section {
+
+    /**
+     * Returns {@link SpanTypeEnum#Sentence}
+     * @return always {@link SpanTypeEnum#Sentence}
+     * @see Span#getType()
+     * @see SpanTypeEnum#Sentence
+     */
+    SpanTypeEnum getType();
+
+    /**
+     * Adds a Chunk relative to this Sentence
+     * @param start the start of the chunk relative to the sentence
+     * @param end the end of the chunk (presumably also relative to the
+     * sentence - to be confirmed against the implementation)
+     * @return the Chunk for the passed offsets
+     */
+    Chunk addChunk(int start, int end);
+
+
+    /**
+     * The Chunks covered by this Sentence<p>
+     * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+     * but consider additions of Spans.
+     * @return the chunks
+     */
+    Iterator<Chunk> getChunks();
+
+}
\ No newline at end of file

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,73 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+
+/**
+ * Represents a {@link #getSpan() span} [{@link #getStart() start},
+ * {@link #getEnd() end}] within the {@link #getContext() text}. Spans also have
+ * an assigned {@link #getType() type}. Possible types are defined within the
+ * {@link SpanTypeEnum}.<p>
+ * This is a meta (abstract) type. Implementations of this interface
+ * SHOULD BE abstract classes.
+ */
+public interface Span extends Annotated, Comparable<Span>{
+
+    /**
+     * Enumeration over the different types - or roles - that spans defined
+     * for an {@link AnalysedText} may play.
+     */
+    public static enum SpanTypeEnum {
+        /**
+         * The Text as a whole
+         */
+        Text,
+        /**
+         * A section of the text (chapter, page, paragraph ...). NOTE: this
+         * does NOT define types of sections.
+         */
+        TextSection,
+        /**
+         * A Sentence
+         */
+        Sentence,
+        /**
+         * A Chunk (e.g. a Noun Phrase) NOTE: this does NOT define types of
+         * Chunks
+         */
+        Chunk,
+        /**
+         * A Token (e.g. a noun, verb, punctuation) NOTE: this does NOT define
+         * types of Tokens
+         */
+        Token;
+    }
+    /**
+     * The type of the Span
+     * @return the {@link SpanTypeEnum type} of this Span
+     */
+    SpanTypeEnum getType();
+
+    /**
+     * The start index of this span. This is the absolute offset from the
+     * {@link #getContext()}{@link AnalysedText#getText() .getText()}
+     * @return the absolute start offset
+     */
+    int getStart();
+    /**
+     * The end index of this span. This is the absolute offset from the
+     * {@link #getContext()}{@link AnalysedText#getText() .getText()}
+     * @return the absolute end offset
+     */
+    int getEnd();
+
+    /**
+     * The {@link AnalysedText} this Span was added to.
+     * @return the AnalysedText representing the context of this Span
+     */
+    AnalysedText getContext();
+    /**
+     * The section of the text selected by this span
+     * @return the selected section of the text
+     */
+    String getSpan();
+    
+}
\ No newline at end of file

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,2 @@
+package org.apache.stanbol.enhancer.nlp.model;
+

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,15 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+
+
+/**
+ * A {@link Span} of the type {@link SpanTypeEnum#Token}. Other than
+ * {@link Chunk} and {@link Sentence} this interface extends {@link Span}
+ * directly and not {@link Section}, so it does not define methods for
+ * enclosed Spans.
+ */
+public interface Token extends Span {
+
+    /**
+     * Returns {@link SpanTypeEnum#Token}
+     * @return always {@link SpanTypeEnum#Token}
+     * @see Span#getType()
+     * @see SpanTypeEnum#Token
+     */
+    SpanTypeEnum getType();
+
+    
+}
\ No newline at end of file

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,70 @@
+package org.apache.stanbol.enhancer.nlp.model.annotation;
+
+import java.util.List;
+
+/**
+ * Defines read and write access to {@link Value}s stored for a key. Values
+ * can be requested either by the plain key or in a typed manner by using an
+ * {@link Annotation}.
+ */
+public interface Annotated {
+    
+    
+
+    /**
+     * Method for requesting Values of a given Key. This allows to request
+     * Values without an {@link Annotation}.
+     * @param key the Key
+     * @return the Value with the highest probability
+     */
+    Value<?> getValue(Object key);
+    
+    /**
+     * Method for requesting the Value of an Annotation.
+     * @param annotation the requested {@link Annotation}
+     * @return the Value with the highest probability
+     * @throws ClassCastException if values of {@link Annotation#getKey()} are
+     * not of type V
+     */
+    <V> Value<V> getAnnotation(Annotation<?,V> annotation);
+
+    /**
+     * Method for requesting Values of a given Key. This allows to request
+     * Values without an {@link Annotation}.
+     * @param key the Key
+     * @return all Values sorted by probability
+     */
+    List<Value<?>> getValues(Object key);
+    
+    /**
+     * Method for requesting the Values of an Annotation.
+     * @param annotation the requested {@link Annotation}
+     * @return all Values sorted by probability
+     * @throws ClassCastException if the returned value of 
+     * {@link Annotation#getKey()} is not of type V
+     */
+    <V> List<Value<V>> getAnnotations(Annotation<?,V> annotation);
+    
+    /**
+     * Appends a Value for the Annotation to possibly already existing values
+     * @param annotation the annotation
+     * @param value the value to append
+     */
+    <K,V> void addAnnotation(Annotation<K,V> annotation, Value<V> value);
+
+    /**
+     * Replaces existing values of the Annotation with the passed one
+     * @param annotation the annotation
+     * @param value the value for the annotation
+     */
+    <K,V> void setAnnotation(Annotation<K,V> annotation, Value<V> value);
+    
+    /**
+     * Appends Values for the Annotation to possibly already existing values
+     * @param annotation the annotation
+     * @param values the values to append
+     */
+    <K,V> void addAnnotations(Annotation<K,V> annotation, List<Value<V>> values);
+
+    /**
+     * Replaces existing values of the Annotation with the passed ones
+     * @param annotation the annotation
+     * @param values the values for the annotation
+     */
+    <K,V> void setAnnotations(Annotation<K,V> annotation, List<Value<V>> values);
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,39 @@
+package org.apache.stanbol.enhancer.nlp.model.annotation;
+
+/**
+ * Definition of an Annotation including the <ul>
+ * <li>key used to store the Annotation
+ * <li>generic type of Values for this Annotation
+ * </ul>
+ * Instances are immutable: both the key and the value type are assigned
+ * once by the constructor.
+ *
+ * @param <K> the type of the key
+ * @param <V> the type of the values
+ */
+public final class Annotation<K,V> {
+
+    /**
+     * The key used to store values of this Annotation
+     */
+    final K key;
+    /**
+     * The type of the values of this Annotation
+     */
+    final Class<V> valueType;
+    
+    /**
+     * Creates an Annotation definition for the passed key and value type.
+     * @param key the key (MUST NOT be <code>null</code>)
+     * @param valueType the type of the values (MUST NOT be <code>null</code>)
+     * @throws IllegalArgumentException if the key or the value type is
+     * <code>null</code>
+     */
+    public Annotation(K key,Class<V> valueType){
+        //NOTE: the second operand needs to check the valueType. Checking the
+        //key twice would let a NULL valueType pass unnoticed
+        if(key == null || valueType == null){
+            throw new IllegalArgumentException("Key and Value MUST NOT be NULL!");
+        }
+        this.key = key;
+        this.valueType = valueType;
+    }
+ 
+    /**
+     * Getter for the key
+     * @return the key (never <code>null</code>)
+     */
+    public K getKey(){
+        return key;
+    }
+    
+    /**
+     * Getter for the type of the values
+     * @return the value type (never <code>null</code>)
+     */
+    public Class<V> getValueType(){
+        return valueType;
+    }
+        
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,118 @@
+package org.apache.stanbol.enhancer.nlp.model.annotation;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+public final class Value<T> {
+
+    /**
+     * For Values that do not have a probability we use {@link Double#NaN}
+     */
+    public static final double UNKNOWN_PROBABILITY = -1.0d;
+    
+    /**
+     * The value
+     */
+    private final T value;
+    /**
+     * The probability of the Annotation
+     */
+    private final double probability;
+    
+    /**
+     * Creates an Annotation for the value with an {@link #UNKNOWN_PROBABILITY
+     * unknown probability}.
+     * @param value the value
+     */
+    public Value(T value){
+        this(value,UNKNOWN_PROBABILITY);
+    }
+    
+    public Value(T value, double probability){
+        if(value == null){
+            throw new IllegalArgumentException("The parsed Value MUST NOT be NULL!");
+        }
+        this.value = value;
+        if(probability != UNKNOWN_PROBABILITY && (probability > 1 || probability < 0)){
+            throw new IllegalArgumentException("Probabilities MUST BE in the range [0..1]");
+        }
+        this.probability = probability;
+    }
+    
+    public final T value(){
+        return value;
+    }
+    
+    public final double probability(){
+        return probability;
+    }
+    
+    public static <T> Value<T> value(T value){
+        return new Value<T>(value);
+    }
+    
+    public static <T> Value<T> value(T value,double probability){
+        return new Value<T>(value,probability);
+    }
+    
+    public static <T> List<Value<T>> values(T...values){
+        List<Value<T>> valList = new ArrayList<Value<T>>(values.length);
+        for(T value : values){
+            valList.add(new Value<T>(value));
+        }
+        return valList;
+    }
+    public static <T> List<Value<T>> values(T[] values, double[] probabilities){
+        return values(values,probabilities,values.length);
+    }
+    public static <T> List<Value<T>> values(T[] values, double[] probabilities,int elements){
+        List<Value<T>> valList = new ArrayList<Value<T>>(elements);
+        for(int i=0;i<elements;i++){
+            valList.add(new Value<T>(values[i],probabilities[i]));
+        }
+        return valList;
+    }
+
+    @Override
+    public int hashCode() {
+        //for long hash see 
+        //http://docs.oracle.com/javase/6/docs/api/java/lang/Double.html#hashCode()
+        long bits = Double.doubleToLongBits(probability);
+        return value.hashCode() + (int)(bits^(bits>>>32));
+    }
+    
+    @Override
+    public boolean equals(Object obj) {
+        return obj instanceof Value && value.equals(((Value<?>)obj).value) &&
+                probability == ((Value<?>)obj).probability;
+    }
+    
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder("Value [");
+        sb.append(value.toString()).append(']');
+        if(probability != UNKNOWN_PROBABILITY){
+            sb.append(".prob=").append(probability);
+        }
+        return sb.toString();
+    }
+    
+    /**
+     * Comparator that sorts Values ONLY based on {@link Value#probability()} -
+     * DO NOT USE with {@link Set} implementations as it will only allow a 
+     * single Value with the same probability.<p>
+     * Values with {@link #UNKNOWN_PROBABILITY} are considered as lowest
+     * probability.
+     */
+    public static final Comparator<Value<?>> PROBABILITY_COMPARATOR = new Comparator<Value<?>>() {
+
+        @Override
+        public int compare(Value<?> o1, Value<?> o2) {
+            return Double.compare(o2.probability, o1.probability);
+        }
+    };
+    
+
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,59 @@
+package org.apache.stanbol.enhancer.nlp.model.impl;
+
+import java.io.IOException;
+
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.osgi.framework.Constants;
+
+@Component(immediate=true)
+@Service(value=AnalysedTextFactory.class)
+@Properties(value={
+    @Property(name=Constants.SERVICE_RANKING,intValue=Integer.MIN_VALUE)
+})
+public class AnalysedTextFactoryImpl extends AnalysedTextFactory {
+
+    /**
+     * Creates the {@link AnalysedText} for the passed Blob and adds it as
+     * ContentPart with the URI {@link AnalysedText#ANALYSED_TEXT_URI} to the
+     * ContentItem.
+     * @param ci the ContentItem the AnalysedText is added to
+     * @param blob the Blob providing the text
+     * @return the created AnalysedText
+     * @throws IllegalStateException if a ContentPart with the URI
+     * {@link AnalysedText#ANALYSED_TEXT_URI} is already present
+     * @throws IOException declared for the creation of the AnalysedText
+     * from the Blob
+     */
+    @Override
+    public AnalysedText createAnalysedText(ContentItem ci, Blob blob) throws IOException {
+        assertNoAnalysedTextPart(ci);
+        //the text is created while no lock is held
+        final AnalysedText analysedText = createAnalysedText(blob);
+        ci.getLock().writeLock().lock();
+        try {
+            //NOTE: in-between the check and this call an other thread might
+            // have added the contentpart
+            ci.addPart(AnalysedText.ANALYSED_TEXT_URI, analysedText);
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+        return analysedText;
+    }
+
+    /**
+     * Reads the text of the passed Blob and wraps it in a new
+     * {@link AnalysedTextImpl}.
+     * @param blob the Blob providing the text
+     * @return the created AnalysedText
+     */
+    @Override
+    public AnalysedText createAnalysedText(Blob blob) throws IOException {
+        return new AnalysedTextImpl(blob, ContentItemHelper.getText(blob));
+    }
+
+    /**
+     * Checks - while holding the read lock of the ContentItem - that no
+     * ContentPart with the URI {@link AnalysedText#ANALYSED_TEXT_URI} is
+     * present.
+     * @param ci the ContentItem to check
+     * @throws IllegalStateException if such a ContentPart exists, regardless
+     * if it is an {@link AnalysedText} or of an incompatible type
+     */
+    private void assertNoAnalysedTextPart(ContentItem ci) {
+        ci.getLock().readLock().lock();
+        try {
+            AnalysedText present = ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
+            String message = "The AnalysedText ContentPart already exists (impl: "
+                + present.getClass().getSimpleName() + "| blob: "
+                + present.getBlob().getMimeType() + ")";
+            throw new IllegalStateException(message);
+        } catch (NoSuchPartException e) {
+            //the expected case: there is nothing registered for the URI
+        } catch (ClassCastException e) {
+            String message = "A ContentPart with the URI '"
+                + AnalysedText.ANALYSED_TEXT_URI + "' already exists but the parts "
+                + "type is not compatible with " + AnalysedText.class.getSimpleName() + "!";
+            throw new IllegalStateException(message, e);
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
+    }
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,89 @@
+package org.apache.stanbol.enhancer.nlp.model.impl;
+
+import java.util.Iterator;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+
+/**
+ * The {@link AnalysedText} implementation that is added as ContentPart to the
+ * ContentItem. It is the {@link SpanTypeEnum#Text} span covering the whole
+ * text (offsets <code>0</code> to <code>text.length()</code>) and uses itself
+ * as context.
+ * @author westei
+ *
+ */
+public class AnalysedTextImpl extends SectionImpl implements AnalysedText {
+
+        
+    /**
+     * The Blob passed to the constructor
+     */
+    private final Blob blob;
+    /**
+     * The analysed text
+     */
+    private String text;
+    
+    /**
+     * Sorted map of spans. NOTE(review): not accessed within this class;
+     * presumably used by the span implementations to register themselves -
+     * to be confirmed.
+     */
+    protected NavigableMap<Span,Span> spans = new TreeMap<Span,Span>();
+    
+    /**
+     * Creates the AnalysedText for the passed blob and text.
+     * @param blob the Blob the text was read from
+     * @param text the text. A <code>null</code> value causes a
+     * {@link NullPointerException} as the length of the text is used as end
+     * offset.
+     */
+    public AnalysedTextImpl(Blob blob, String text){
+        super(SpanTypeEnum.Text,0,text.length());
+        this.setContext(this); //set the context to itself
+        this.blob = blob;
+        this.text = text;
+    }
+    
+    /**
+     * @return always {@link SpanTypeEnum#Text}
+     */
+    @Override
+    public SpanTypeEnum getType() {
+        return SpanTypeEnum.Text;
+    }
+    
+    /**
+     * Creates a {@link SentenceImpl} from the context, this instance and the
+     * passed offsets and returns the result of registering it.
+     * @param start the start offset of the sentence
+     * @param end the end offset of the sentence
+     * @return the value returned by <code>register(..)</code>
+     */
+    @Override
+    public SentenceImpl addSentence(int start, int end){
+        return register(new SentenceImpl(context, this, start, end));
+    }
+    
+    /**
+     * @return the result of <code>filter(..)</code> for the type
+     * {@link Sentence}
+     */
+    @Override
+    public Iterator<Sentence> getSentences(){
+        return filter(Sentence.class);
+    }
+        
+    /**
+     * Creates a {@link ChunkImpl} from the context, this instance and the
+     * passed offsets and returns the result of registering it.
+     * @param start the start offset of the chunk
+     * @param end the end offset of the chunk
+     * @return the value returned by <code>register(..)</code>
+     */
+    @Override
+    public ChunkImpl addChunk(int start, int end){
+        return register(new ChunkImpl(context, this, start, end));
+    }
+    
+    /**
+     * @return the result of <code>filter(..)</code> for the type
+     * {@link Chunk}
+     */
+    @Override
+    public Iterator<Chunk> getChunks(){
+        return filter(Chunk.class);
+    }
+    /**
+     * Reference to the Blob whose data got analysed. Returns the same
+     * instance as {@link #getBlob()}, but can not be overridden.
+     * @return the Blob passed to the constructor
+     */
+    public final Blob getAnalysedBlob(){
+        return blob;
+    }
+
+    /**
+     * @return the text passed to the constructor
+     */
+    @Override
+    public CharSequence getText() {
+        return text;
+    }
+
+    /**
+     * @return the Blob passed to the constructor
+     */
+    @Override
+    public Blob getBlob() {
+        return blob;
+    }
+}