You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/09/19 10:48:39 UTC
svn commit: r1387488 [3/5] - in
/incubator/stanbol/branches/stanbol-nlp-processing: ./ data/
data/bundlelists/sentiment/ data/bundlelists/sentiment/src/
data/bundlelists/sentiment/src/main/
data/bundlelists/sentiment/src/main/bundles/ data/opennlp/lang...
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.sentiment.classifiers;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.lucene.analysis.en.EnglishMinimalStemmer;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
+import org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
+import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A word classifier for the english language based on SentiWordNet. Reads in a SentiWordNet file and
+ * represents mappings from word to sentiment score between -1 and 1 in a hashmap.
+ * <p/>
+ * Future versions might make use of a disk-based storage of the hashmap to improve memory performance.
+ * <p/>
+ * Note that a license for SentiWordNet is required if you intend to use the classifier in commercial
+ * settings.
+ * <p/>
+ * @author Sebastian Schaffert
+ */
+@Component(immediate = true)
+public class SentiWordNet {
+
+    /**
+     * Properties describing the tracked data file. They are handed to the
+     * {@link DataFileTracker} together with the listener when the component
+     * is activated.
+     */
+    private static final Map<String,String> modelProperties = new HashMap<String,String>();
+    static {
+        // This component registers its classifier for the language "en", so the
+        // description of the word list must not claim it is German.
+        modelProperties.put("Description", "Sentiment Word List (English)");
+        // NOTE(review): this is the WordNet home page, presumably not the place
+        // where the SentiWordNet file itself can be downloaded - confirm.
+        modelProperties.put("Download Location", "http://wordnet.princeton.edu/");
+    }
+ private static Logger log = LoggerFactory.getLogger(SentiWordNet.class);
+
+ private static final String SENTIWORDNET_RESOURCE = "SentiWordNet_3.0.0_20120206.txt";
+
+ protected String sentiWordNetFile;
+
+ private ModelListener modelListener = new ModelListener();
+
+ @Reference
+ private DataFileTracker dataFileTracker;
+
+ private BundleContext bundleContext;
+
+ protected SentiWordNetClassifierEN classifier;
+
+ protected ServiceRegistration classifierRegistration;
+
+ public SentiWordNet() {}
+
+    /**
+     * Activates the component: remembers the {@link BundleContext}, creates
+     * the (still empty) classifier and starts tracking the SentiWordNet data
+     * file. The classifier itself is registered as a service by the
+     * {@link ModelListener} after the data file was loaded.
+     *
+     * @param ctx the component context used to obtain the {@link BundleContext}
+     */
+    @Activate
+    protected void activate(ComponentContext ctx){
+        this.bundleContext = ctx.getBundleContext();
+        //TODO: make configurable
+        this.sentiWordNetFile = SENTIWORDNET_RESOURCE;
+
+        this.classifier = new SentiWordNetClassifierEN();
+
+        // register the listener last: all fields it reads are initialised by now
+        this.dataFileTracker.add(
+            this.modelListener, this.sentiWordNetFile, modelProperties);
+    }
+
+    /**
+     * Deactivates the component.
+     * <p/>
+     * The data file listener is removed from the tracker <b>before</b> the
+     * service is unregistered and the classifier is released. With the
+     * listener still registered, a notification arriving during the teardown
+     * could load the word list and register the classifier service again
+     * after it was unregistered here, leaving a registration nobody cleans up.
+     * The remaining cleanup runs in a <code>finally</code> block so that it is
+     * also performed if removing the listener fails.
+     *
+     * @param ctx the component context (not used)
+     */
+    @Deactivate
+    protected void deactivate(ComponentContext ctx){
+        try {
+            // stop tracking first so that the listener is no longer notified
+            dataFileTracker.removeAll(modelListener);
+        } finally {
+            if(classifierRegistration != null){
+                classifierRegistration.unregister();
+                classifierRegistration = null;
+            }
+            if(classifier != null){
+                classifier.close();
+                classifier = null;
+            }
+            sentiWordNetFile = null;
+        }
+    }
+
+ /**
+ * Tracks the SentiWordNet data file. As soon as the file becomes available
+ * it is loaded into the classifier and the classifier is registered as
+ * a service.
+ */
+ private class ModelListener implements DataFileListener {
+
+ /**
+ * Loads the word list from the parsed stream and registers the classifier
+ * service. Nothing is loaded if the classifier was already released
+ * (is <code>null</code>) or if the resource is not the tracked file.
+ *
+ * @param resourceName the name of the resource that became available
+ * @param is the stream used to read the resource
+ * @return <code>false</code> ("keep tracking") if loading failed with an
+ * {@link IOException} or a {@link RuntimeException}, otherwise
+ * <code>true</code> - this includes notifications for resources
+ * other than the tracked one
+ */
+ @Override
+ public boolean available(String resourceName, InputStream is) {
+ if(sentiWordNetFile.equals(resourceName)){
+ log.info("{} resource available",resourceName);
+ try {
+ long start = System.currentTimeMillis();
+ if(classifier != null){
+ classifier.parseSentiWordNet(is);
+ log.info(" ... loaded in {} ms",(System.currentTimeMillis()-start));
+ registerService(); //register the service
+ }
+ } catch (IOException e) {
+ log.warn("Unable to load '"+resourceName+"'!",e);
+ return false; //keep tracking
+ } catch (RuntimeException e) {
+ log.error("RuntimeException while loading '"
+ +resourceName+"!",e);
+ return false; //keep tracking
+ }
+ } else {
+ log.warn("Tracker notified event for non-tracked resource '{}'"
+ + "(tracked: {})!",resourceName,sentiWordNetFile);
+ }
+ //remove registration
+ return true;
+ }
+
+ /**
+ * Not used by this listener.
+ *
+ * @param resourceName the name of the resource that became unavailable
+ * @return always <code>false</code>
+ */
+ @Override
+ public boolean unavailable(String resourceName) {
+ //not used;
+ return false;
+ }
+
+ }
+
+ protected void registerService() {
+ Dictionary<String,Object> serviceProperties = new Hashtable<String,Object>();
+ serviceProperties.put("language", "en"); //set the language
+ BundleContext bc = bundleContext;
+ if(bc != null){
+ classifierRegistration = bc.registerService(
+ SentimentClassifier.class.getName(), classifier,
+ serviceProperties);
+ }
+ }
+ /**
+ * The OSGI service registered as soon as the required DataFiles are
+ * available
+ */
+ public static class SentiWordNetClassifierEN extends LexicalCategoryClassifier implements SentimentClassifier {
+
+ private ReadWriteLock lock = new ReentrantReadWriteLock();
+ private Map<String,Double> wordMap = new TreeMap<String,Double>();
+
+ private EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();
+
+ protected SentiWordNetClassifierEN() {}
+
+        /**
+         * Reads a SentiWordNet file and adds the words it defines to the word map.
+         * <p/>
+         * The file is processed line by line:
+         * <ul>
+         * <li>empty lines and lines starting with '#' are ignored
+         * <li>valid lines have the format
+         *     <code>POS ID POSSCORE NEGSCORE SYNONYMS GLOSS</code> separated by tabs
+         * </ul>
+         * The score of a line is <code>POSSCORE - NEGSCORE</code>. Lines with a
+         * score of zero are skipped. For all other lines every synonym is stored
+         * in stemmed form with that score; a later line overrides the score of an
+         * earlier line containing the same (stemmed) word.
+         * <p/>
+         * Lines that can not be parsed are logged and skipped. The write lock is
+         * held while the file is read and the parsed stream is closed at the end.
+         *
+         * @param is the stream to read the SentiWordNet file from
+         * @throws IOException on any error while reading from the parsed stream
+         */
+        protected void parseSentiWordNet(InputStream is) throws IOException {
+            // decode with a fixed charset so that the loaded words do not depend on
+            // the default charset of the platform
+            BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+            lock.writeLock().lock();
+            try {
+                for (String line = in.readLine(); line != null; line = in.readLine()) {
+                    line = line.trim();
+                    if (line.length() == 0 || line.charAt(0) == '#') {
+                        continue; // empty line or comment
+                    }
+                    String[] components = line.split("\t");
+                    if (components.length < 5) {
+                        // report the real problem instead of the bare index of an
+                        // ArrayIndexOutOfBoundsException
+                        log.warn("could not parse SentiWordNet line '{}': {}", line,
+                            "expected at least 5 tab separated fields");
+                        continue;
+                    }
+                    try {
+                        double posScore = Double.parseDouble(components[2]);
+                        double negScore = Double.parseDouble(components[3]);
+                        double score = posScore - negScore;
+                        if (score == 0.0) {
+                            continue; // neutral entries are not stored
+                        }
+                        for (String synonymToken : components[4].split(" ")) {
+                            // synonymTokens are of the form word#position, so we
+                            // strip out the position part
+                            String[] synonym = synonymToken.split("#");
+                            if (synonym.length > 0 && synonym[0].length() > 0) {
+                                wordMap.put(getStemmed(synonym[0]), Double.valueOf(score));
+                            } // else empty token (e.g. caused by a double space)
+                        }
+                    } catch (NumberFormatException ex) {
+                        log.warn("could not parse SentiWordNet line '{}': {}", line, ex.getMessage());
+                    }
+                }
+            } finally {
+                lock.writeLock().unlock();
+                IOUtils.closeQuietly(in);
+            }
+        }
+
+        /**
+         * Getter for the number of entries currently held by the word map.
+         *
+         * @return the size of the word map, read while holding the read lock
+         */
+        public int getWordCount() {
+            final int count;
+            lock.readLock().lock();
+            try {
+                count = wordMap.size();
+            } finally {
+                lock.readLock().unlock();
+            }
+            return count;
+        }
+
+        /**
+         * Returns the sentiment value stored for the parsed word. The word is
+         * stemmed before it is looked up in the word map. Values range from
+         * very negative (-1) to very positive (1).
+         *
+         * @param word the word to classify
+         * @return the value stored for the stemmed word or <code>0.0</code> if
+         * the word is unknown
+         */
+        @Override
+        public double classifyWord(String word) {
+            final String key = getStemmed(word);
+            final Double score;
+            lock.readLock().lock();
+            try {
+                score = wordMap.get(key);
+            } finally {
+                lock.readLock().unlock();
+            }
+            if (score == null) {
+                return 0.0; // unknown word
+            }
+            return score.doubleValue();
+        }
+
+        /**
+         * Stems the parsed word with the {@link EnglishMinimalStemmer}.
+         * <p/>
+         * The stemmer works on a char buffer and returns the new length of the
+         * word. It may also rewrite characters within the buffer (e.g. a plural
+         * ending in "ies" is turned into "y"). The result is therefore built
+         * from the buffer handed to the stemmer and not cut from the original
+         * String - otherwise "parties" would become "parti" and never match
+         * the entry stored for "party".
+         *
+         * @param word the word to stem
+         * @return the stemmed word
+         */
+        private String getStemmed(String word) {
+            char[] buffer = word.toCharArray();
+            int stemmedLength = stemmer.stem(buffer, buffer.length);
+            return new String(buffer, 0, stemmedLength);
+        }
+
+ /**
+ * Getter for the language of this classifier.
+ *
+ * @return always "en"
+ */
+ @Override
+ public String getLanguage() {
+ return "en";
+ }
+
+ /**
+ * Releases the loaded word list by clearing the word map. The map is
+ * cleared while holding the write lock.
+ */
+ protected void close(){
+ lock.writeLock().lock();
+ try {
+ wordMap.clear();
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+ }
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.sentiment.services;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.sentimentAnnotation;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
+import org.apache.felix.scr.annotations.ReferenceStrategy;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.lucene.messages.NLS;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import org.apache.stanbol.enhancer.engines.sentiment.classifiers.SentiWSComponent;
+import org.apache.stanbol.enhancer.engines.sentiment.classifiers.SentiWordNet;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.sentiment.SentimentTag;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * A Stanbol engine that associates sentiment values with the tokens created by the POS tagging engine.
+ * Sentiment values are added to the POSContentPart of the content item and can be further analysed by other
+ * engines, e.g. to compute sentiment values for the whole content item or in relation to certain nouns.
+ * <p/>
+ * The configuration allows specifying whether to analyse all words or only adjectives and nouns (a typical case).
+ * <p/>
+ * Currently, sentiment analysis is available for English and for German language. It uses the following word lists:
+ * <ul>
+ * <li>English: SentiWordNet (http://wordnet.princeton.edu/), license allows commercial use</li>
+ * <li>German: SentiWS (http://wortschatz.informatik.uni-leipzig.de/download/), license does NOT allow commercial use</li>
+ * </ul>
+ * <p/>
+ * Author: Sebastian Schaffert
+ */
+@Component(immediate = true, metatype = true, configurationFactory = true, policy = ConfigurationPolicy.REQUIRE)
+@Service
+@Properties(value={
+ @Property(name= EnhancementEngine.PROPERTY_NAME,value="sentiment")
+})
+
+public class SentimentEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> {
+
+ /**
+ * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
+ * are the languages given as default value.
+ */
+ @Property(value={SentimentEngine.DEFAULT_LANGUAGE_CONFIG})
+ public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentiment.languages";
+
+ /**
+ * When set to true, only adjectives and nouns will be considered in sentiment analysis.
+ */
+ @Property(boolValue = SentimentEngine.DEFAULT_PROCESS_ADJECTIVES_ONLY )
+ public static final String CONFIG_ADJECTIVES = "org.apache.stanbol.enhancer.sentiment.adjectives";
+ /**
+ * POS tags that are not selected by {@link SentimentClassifier#isAdjective(PosTag)}
+ * or {@link SentimentClassifier#isNoun(PosTag)} are ignored if their confidence
+ * is >= the configured value. If there are multiple POS tag suggestions,
+ * words that do have a suitable tag are still considered if the
+ * confidence of the fitting tag is >= {min-pos-confidence}/2
+ */
+ @Property(doubleValue = SentimentEngine.DEFAULT_MIN_POS_CONFIDNECE)
+ public static final String CONFIG_MIN_POS_CONFIDENCE = "org.apache.stanbol.enhancer.sentiment.min-pos-confidence";
+
+ @Property(boolValue=true)
+ public static final String DEBUG_SENTIMENTS = "debug";
+ boolean debugSentiments;
+
+ public static final String DEFAULT_LANGUAGE_CONFIG = "*";
+ private LanguageConfiguration langaugeConfig =
+ new LanguageConfiguration(CONFIG_LANGUAGES, new String[]{DEFAULT_LANGUAGE_CONFIG});
+
+ /**
+ * The minimum confidence of POS tags so that a token is NOT processed if
+ * the {@link LexicalCategory} is NOT {@link LexicalCategory#Adjective} (or
+ * {@link LexicalCategory#Noun Noun} if {@link #CONFIG_ADJECTIVES} is
+ * deactivated) - default: 0.8<p>
+ */
+ private static final double DEFAULT_MIN_POS_CONFIDNECE = 0.8;
+
+ private static final boolean DEFAULT_PROCESS_ADJECTIVES_ONLY = false;
+
+
+ private static Logger log = LoggerFactory.getLogger(SentimentEngine.class);
+
+ /**
+ * {@link SentimentClassifier} are now OSGI services and injected via events
+ * (calls to {@link #bindClassifier(SentimentClassifier)} and
+ * {@link #unbindClassifier(SentimentClassifier)}) as soon as they become
+ * (un)available.
+ */
+ @Reference(referenceInterface=SentimentClassifier.class,
+ cardinality=ReferenceCardinality.OPTIONAL_MULTIPLE,
+ bind="bindClassifier",
+ unbind="unbindClassifier",
+ policy=ReferencePolicy.DYNAMIC,
+ strategy=ReferenceStrategy.EVENT)
+ private Map<String,SentimentClassifier> classifiers = Collections.synchronizedMap(
+ new HashMap<String,SentimentClassifier>());
+    /**
+     * Bind method for {@link #classifiers}. The classifier is stored under the
+     * language it reports. If an other classifier was already stored for that
+     * language it is replaced and a warning is logged.
+     *
+     * @param classifier the classifier that became available
+     */
+    protected void bindClassifier(SentimentClassifier classifier){
+        // read the language once so that logging and the map use the same key
+        String lang = classifier.getLanguage();
+        log.info(" ... bind Sentiment Classifier {} for language {}",
+            classifier.getClass().getSimpleName(),lang);
+        SentimentClassifier old = classifiers.put(lang, classifier);
+        if(old != null){
+            // the closing parenthesis was missing in the logged message
+            log.warn("Replaced Sentiment Classifier for language {} (old: {}, new: {})",
+                new Object[]{old.getLanguage(),old,classifier});
+        }
+    }
+ /** unbind method for {@link #classifiers} */
+ protected void unbindClassifier(SentimentClassifier classifier){
+ String lang = classifier.getLanguage();
+ synchronized (classifiers) {
+ SentimentClassifier current = classifiers.remove(lang);
+ if(!classifier.equals(current) //the current is not the parsed one
+ && current != null){
+ classifiers.put(lang,current); //re-add the value
+ } else {
+ log.info(" ... unbind Sentiment Classifier {} for language {}",
+ classifier.getClass().getSimpleName(),lang);
+ }
+ }
+ }
+
+ /**
+ * The processed {@link LexicalCategory LexicalCategories}.
+ */
+ boolean adjectivesOnly = DEFAULT_PROCESS_ADJECTIVES_ONLY;
+
+ /**
+ * The minimum {@link PosTag} value {@link Value#probability() confidence}.<p>
+ * This means that if the {@link Value#probability() confidence} of a
+ * {@link NlpAnnotations#POSAnnotation}s (returned by
+ * {@link Token#getAnnotations(Annotation)}) is greater than
+ * {@link #minPOSConfidence} that the result of
+ * {@link SentimentClassifier#isAdjective(PosTag)} (and
+ * {@link SentimentClassifier#isNoun(PosTag)} - if #CONFIG_ADJECTIVES is
+ * deactivated) is used to decide if a Token needs to be processed or not.
+ * Otherwise further {@link NlpAnnotations#POSAnnotation}s are analysed for
+ * processable POS tags. Processable POS tags are accepted until
+ * <code>{@link #minPOSConfidence}/2</code>.
+ */
+ private double minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE;
+
+    /**
+     * Indicates if this engine can enhance the supplied ContentItem, and if it
+     * suggests enhancing it synchronously or asynchronously. The
+     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can
+     * force sync/async mode if desired, it is just a suggestion from the engine.
+     * <p/>
+     * Returns {@link EnhancementEngine}#ENHANCE_ASYNC if <ul>
+     * <li> the {@link AnalysedText} content part is present
+     * <li> the language of the content is known and
+     * <li> a sentiment classifier is bound for that language
+     * </ul>
+     * and {@link EnhancementEngine}#CANNOT_ENHANCE otherwise.
+     * <p/>
+     * NOTE(review): the language configuration ({@link #CONFIG_LANGUAGES}) is
+     * not consulted by this method - confirm whether this is intended.
+     *
+     * @param ci the content item to check
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *             if the introspecting process of the content item
+     *             fails
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        if(getAnalysedText(this,ci, false) == null){
+            return CANNOT_ENHANCE;
+        }
+        final String lang = getLanguage(this, ci,false);
+        final boolean supported = lang != null && classifiers.containsKey(lang);
+        return supported ? ENHANCE_ASYNC : CANNOT_ENHANCE;
+    }
+
+
+ /**
+ * Compute enhancements for supplied ContentItem. The results of the process
+ * are expected to be stored in the metadata of the content item.
+ * <p/>
+ * This implementation iterates over the tokens of the {@link AnalysedText}
+ * and classifies the selected ones with the {@link SentimentClassifier}
+ * bound for the language of the content. Tokens with a sentiment other than
+ * 0.0 get a {@link SentimentTag} annotation (positive or negative) with the
+ * absolute sentiment value parsed as second argument of the {@link Value}.
+ * <p/>
+ * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
+ * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
+ *
+ * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+ * if the underlying process failed to work as
+ * expected
+ * @throws IllegalStateException if no classifier is bound for the language
+ * of the content
+ */
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ AnalysedText analysedText = getAnalysedText(this,ci, true);
+ String language = getLanguage(this, ci, true);
+ SentimentClassifier classifier = classifiers.get(language);
+ if(classifier == null){
+ throw new IllegalStateException("Sentiment Classifier for language '"
+ + language +"' not available. As this is also checked in "
+ + " canEnhance this may indicate an Bug in the used "
+ + "EnhancementJobManager!");
+ }
+ //TODO: locking for AnalysedText not yet defined
+// ci.getLock().writeLock().lock();
+// try {
+ Iterator<Token> tokens = analysedText.getTokens();
+ while(tokens.hasNext()){
+ Token token = tokens.next();
+ // all tokens are processed unless the engine is restricted via adjectivesOnly
+ boolean process = !adjectivesOnly;
+ if(!process){ //check POS types
+ Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POSAnnotation).iterator();
+ boolean ignore = false;
+ // inspect the POS tags in the returned order until the token is
+ // either accepted (process) or rejected (ignore); tokens without
+ // any POS tag are not processed
+ while(!ignore && !process && posTags.hasNext()) {
+ Value<PosTag> value = posTags.next();
+ PosTag tag = value.value();
+ boolean state = classifier.isAdjective(tag) || classifier.isNoun(tag);
+ // a non adjective/noun tag with a high confidence rejects the token
+ ignore = !state && value.probability() >= minPOSConfidence;
+ // an adjective/noun tag is already accepted with half of that confidence
+ process = state && value.probability() >= (minPOSConfidence/2.0);
+ }
+ } //else process all tokens ... no POS tag checking needed
+ if(process){
+ double sentiment = classifier.classifyWord(token.getSpan());
+ if(sentiment != 0.0){
+ // the sign selects the tag, the absolute value is parsed to the Value
+ token.addAnnotation(sentimentAnnotation,
+ new Value<SentimentTag>(sentiment > 0 ?
+ SentimentTag.POSITIVE : SentimentTag.NEGATIVE,
+ Math.abs(sentiment)));
+ } //else do not set sentiments with 0.0
+ }
+ }
+// } finally {
+// ci.getLock().writeLock().unlock();
+// }
+ // disabled debugging code: logs the sentiments of all tokens per sentence
+// if(debugSentiments){
+// Iterator<Sentence> sentences = analysedText.getSentences();
+// if(sentences.hasNext()){
+// while(sentences.hasNext()){
+// Sentence sent = sentences.next();
+// log.info("Sentence: {}", sent.getSpan());
+// tokens = sent.getTokens();
+// double positive = 0.0;
+// double negaitve = 0.0;
+// while (tokens.hasNext()){
+// Token token = tokens.next();
+// Value<SentimentTag> sentiment = token.getAnnotation(NlpAnnotations.sentimentAnnotation);
+// if(sentiment != null){
+// if(sentiment.value().isPositive()){
+// positive = positive+sentiment.probability();
+// } else {
+// negaitve = negaitve+sentiment.probability();
+// }
+// Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POSAnnotation);
+// log.info(" - {} '{}'[{}] - value: {}",
+// new Object []{
+// sentiment.value().isPositive()?"positive":"negative",
+// token.getSpan(),
+// posTag != null ? posTag.value(): "POS unknown",
+// sentiment.probability()
+// });
+// }
+// }
+// log.info(" > positive: {} | negative: {} | sum: {}",
+// new Object []{positive, negaitve, (positive - negaitve)});
+// }
+// } else {
+//
+// }
+// }
+ }
+
+
+    /**
+     * Activates the engine and reads its configuration:
+     * <ul>
+     * <li> the language configuration ({@link #CONFIG_LANGUAGES})
+     * <li> if only adjectives and nouns are processed ({@link #CONFIG_ADJECTIVES})
+     * <li> the minimum POS tag confidence ({@link #CONFIG_MIN_POS_CONFIDENCE})
+     * <li> the {@link #DEBUG_SENTIMENTS} flag
+     * </ul>
+     *
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     * @throws ConfigurationException e.g. if the configured minimum POS
+     * confidence can not be parsed or is not &gt; 0 and &lt; 1
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException {
+        // this is the sentiment engine - the message previously claimed that
+        // the POS tagging engine is activated
+        log.info("activating sentiment engine");
+        super.activate(ce);
+        @SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ce.getProperties();
+
+        //parse the configured languages
+        langaugeConfig.setConfiguration(properties);
+
+        //set the processed lexical categories
+        Object value = properties.get(CONFIG_ADJECTIVES);
+        adjectivesOnly = value instanceof Boolean ? (Boolean)value :
+            value != null ? Boolean.parseBoolean(value.toString()) :
+                DEFAULT_PROCESS_ADJECTIVES_ONLY;
+
+        //set minimum POS confidence
+        value = properties.get(CONFIG_MIN_POS_CONFIDENCE);
+        if(value instanceof Number){
+            minPOSConfidence = ((Number)value).doubleValue();
+        } else if(value != null){
+            try {
+                minPOSConfidence = Double.parseDouble(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE,
+                    "Unable to parsed minimum POS confidence value from '"
+                    + value +"'!",e);
+            }
+        } else {
+            minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE;
+        }
+        if(minPOSConfidence <= 0 || minPOSConfidence >= 1){
+            throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE,
+                "The configured minimum POS confidence value '"
+                +minPOSConfidence+"' MUST BE > 0 and < 1!");
+        }
+
+        //TODO: just for testing
+        value = properties.get(DEBUG_SENTIMENTS);
+        debugSentiments = value instanceof Boolean ? (Boolean)value :
+            value != null ? Boolean.parseBoolean(value.toString()) :
+                false;
+    }
+
+ /**
+ * Deactivates the engine: clears the map of bound classifiers, resets the
+ * language configuration to its default and finally calls the deactivate
+ * method of the super class.
+ *
+ * @param ctx the component context parsed to the super class
+ */
+ @Deactivate
+ protected void deactivate(ComponentContext ctx){
+ //remove remaining classifiers
+ this.classifiers.clear();
+ langaugeConfig.setDefault();
+ super.deactivate(ctx);
+ }
+
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Sep 19 08:48:32 2012
@@ -0,0 +1,47 @@
+#
+# Copyright 2012, FORMCEPT [http://www.formcept.com]
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.stanbol.enhancer.engines.sentiment.services.SentimentEngine.name=Apache Stanbol Enhancer Engine: Sentiment Analysis
+
+stanbol.enhancer.engine.name.name=Sentiment Analysis Engine
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+#====================================================
+#Properties used to configure FORMCEPT Enhancer
+#====================================================
+
+
+org.apache.stanbol.enhancer.sentiment.languages.name=Language configuration
+org.apache.stanbol.enhancer.sentiment.languages.description=Takes a list of ISO \
+ language codes of supported languages. Currently supported are the languages given as default value.
+
+org.apache.stanbol.enhancer.sentiment.adjectives.name=Adjectives/Nouns only
+org.apache.stanbol.enhancer.sentiment.adjectives.description=When set to true, only adjectives and nouns \
+ will be considered in sentiment analysis. Note that this will cause this engine to tag words only \
+ if POS tags are available.
+
+org.apache.stanbol.enhancer.sentiment.min-pos-confidence.name=Minimum POS Tag Confidence
+org.apache.stanbol.enhancer.sentiment.min-pos-confidence.description=If "Adjectives/Nouns only" \
+ is activated this is used as minimum confidence for POS tags. All non Noun \
+ and Adjective tokens with a confidence >= the configured value are filtered. \
+ NOTE that for words with ambiguous POS tags (multiple POS tags and no tag with \
+ a confidence >= the configured value) POS tags representing a Noun or Adjective \
+ are also considered if their confidence >= half of the configured value.
+
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/sentiment-wordclassifier/src/main/resources/config/at.newmedialab.stanbol.enhancer.person.PersonEnhancer-snml.config Wed Sep 19 08:48:32 2012
@@ -0,0 +1,2 @@
+stanbol.enhancer.chain.name="person-enhancer"
+stanbol.enhancer.chain.list.enginelist=["tika","langid","snml-person-enhancer"]
Propchange: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 08:48:32 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+.classpath
+
+target
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,125 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancer NLP</name>
+ <description>
+ Module that defines the ContentPart defining the NLP processing metadata.
+ </description>
+ <inceptionYear>2012</inceptionYear>
+
+ <scm>
+ <connection>
+ scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/generic/nlp/
+ </connection>
+ <developerConnection>
+ scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/generic/nlp/
+ </developerConnection>
+ <url>http://incubator.apache.org/stanbol/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <!-- Enable this for including your enhancement chain configuration -->
+ <!-- TODO: maybe include POS AnnotationModel definitions -->
+ <!-- <Install-Path>config</Install-Path> -->
+ <Export-Package>
+ org.apache.stanbol.enhancer.nlp;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.model;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.model.annotation;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.ontology;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.pos;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.pos.*;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.phrase;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.sentiment;version=${project.version},
+ org.apache.stanbol.enhancer.nlp.utils;version=${project.version}
+ </Export-Package>
+ <Private-Package>
+ org.apache.stanbol.enhancer.nlp.model.impl.*;version=${project.version}
+ </Private-Package>
+ <Embed-Dependency></Embed-Dependency>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <exclude>src/license/THIRD-PARTY.properties</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-collections</groupId>
+ <artifactId>commons-collections</artifactId>
+ <version>3.2.1</version>
+ </dependency>
+ <!-- Logging -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>1.6.1</version>
+ </dependency>
+
+ <!-- test dependencies -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.core</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+
+</project>
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/NlpAnnotations.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,38 @@
+package org.apache.stanbol.enhancer.nlp;
+
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.sentiment.SentimentTag;
+
+/**
+ * Defines the {@link Annotation} constants typically used by NLP components.
+ */
+public interface NlpAnnotations {
+
+ /**
+ * The POS {@link Annotation} added by POS taggers to the {@link Token}s of
+ * an {@link AnalysedText}. Values are of type {@link PosTag}.
+ */
+ Annotation<String,PosTag> POSAnnotation = new Annotation<String,PosTag>(
+ "stanbol.enhancer.nlp.pos", PosTag.class);
+
+
+ /**
+ * The Phrase {@link Annotation} added by chunkers to a group of
+ * [1..*] {@link Token}s. Values are of type {@link PhraseTag}.<p>
+ * This annotation is typically found on {@link Chunk}s.
+ */
+ Annotation<String,PhraseTag> phraseAnnotation = new Annotation<String,PhraseTag>(
+ "stanbol.enhancer.nlp.phrase", PhraseTag.class);
+
+ /**
+ * The Sentiment {@link Annotation} added by a sentiment tagger, typically
+ * to single {@link Token}s that carry a positive or negative sentiment.
+ */
+ Annotation<String,SentimentTag> sentimentAnnotation = new Annotation<String,SentimentTag>(
+ "stanbol.enhancer.nlp.sentiment", SentimentTag.class);
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/Tag.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,62 @@
+package org.apache.stanbol.enhancer.nlp;
+
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+
+/**
+ * Base class for tags (e.g. {@link PosTag}) that can be added to a {@link TagSet}.
+ * Equality is based on the tag string and the assigned TagSet.
+ * @param <T> the concrete Tag type (self-bounded generic, like {@link Enum})
+ */
+public abstract class Tag<T extends Tag<T>> {
+    /** The string value of this tag; never <code>null</code> nor empty. */
+    protected final String tag;
+    /** The {@link TagSet} this tag was added to; <code>null</code> if none. */
+    private TagSet<T> annotationModel;
+    /**
+     * Creates a Tag for the given String.
+     * @param tag the tag
+     * @throws IllegalArgumentException if the parsed tag is <code>null</code> or empty.
+     */
+    public Tag(String tag){
+        if(tag == null || tag.isEmpty()){
+            throw new IllegalArgumentException("The tag MUST NOT be NULL nor empty!");
+        }
+        this.tag = tag;
+    }
+    /** @return the string value of this tag */
+    public final String getTag() {
+        return tag;
+    }
+    /** @return the TagSet this tag is assigned to or <code>null</code> if none */
+    public final TagSet<T> getAnnotationModel() {
+        return annotationModel;
+    }
+    /**
+     * Used by the {@link TagSet} class to assign itself to a Tag
+     * that is {@link TagSet#addTag(Tag) added}.
+     * @param annotationModel the annotationModel to set
+     */
+    protected final void setAnnotationModel(TagSet<T> annotationModel) {
+        this.annotationModel = annotationModel;
+    }
+    @Override
+    public String toString() {
+        return String.format("%s %s", getClass().getSimpleName(), tag);
+    }
+
+    @Override
+    public int hashCode() {
+        return tag.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if(obj instanceof Tag && tag.equals(((Tag<?>)obj).tag)){
+            return (annotationModel == null && ((Tag<?>)obj).annotationModel == null) ||
+                    (annotationModel != null && annotationModel.equals(((Tag<?>)obj).annotationModel));
+        } else {
+            return false;
+        }
+    }
+}
\ No newline at end of file
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/TagSet.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,131 @@
+package org.apache.stanbol.enhancer.nlp;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+/**
+ * A TagSet used for tagging {@link Annotated} resources like {@link Token}s,
+ * {@link Chunk}s or even whole {@link Sentence}s and
+ * {@link AnalysedText Text}s.<p>
+ * A TagSet defines a set of {@link Tag}s and can be used for one or more
+ * {@link #getLanguages() languages}.<p>
+ * {@link TagSet} uses generics to allow the specification of more specific
+ * TagSets e.g. for {@link PosTag} or {@link PhraseTag}s.
+ */
+public class TagSet<T extends Tag<T>> implements Iterable<T>{
+
+    private final String name;
+    private final Set<String> languages;
+    /** The added tags, keyed by their {@link Tag#getTag() tag string}. */
+    private final Map<String,T> tag2PosTag = new HashMap<String,T>();
+
+    private final Map<String,Object> properties = new HashMap<String,Object>();
+
+    /**
+     * Creates a TagSet for Tags of a specific type (e.g. {@link PosTag} or
+     * {@link PhraseTag}) that can be used for the parsed languages.<p>
+     * In addition TagSets allow to add additional
+     * {@link #getProperties() properties}. Those can be used to assign
+     * information such as the URIs of Ontologies that define the model and
+     * the linking to the
+     * <a href="http://nlp2rdf.lod2.eu/olia/">nlp2rdf OLIA</a> annotation and
+     * linking models.<p>
+     * In the future those metadata might even be used by components to
+     * automatically create Annotation models.<p>
+     * NOTE that the parsed name is used as unique criteria. TODO this should
+     * be evaluated.
+     * @param name the unique name (is used for {@link #hashCode()} and
+     * {@link #equals(Object)}); MUST NOT be <code>null</code> nor empty
+     * @param languages the languages; <code>null</code> elements are ignored
+     * @throws IllegalArgumentException if the name is <code>null</code> or empty
+     */
+    public TagSet(String name, String...languages) {
+        if(name == null || name.isEmpty()){
+            throw new IllegalArgumentException("The parsed name MUST NOT be NULL nor empty!");
+        }
+        this.name = name;
+        if(languages != null && languages.length > 0){
+            Set<String> langSet = new HashSet<String>(Arrays.asList(languages));
+            langSet.remove(null);
+            this.languages = Collections.unmodifiableSet(langSet);
+        } else {
+            this.languages = Collections.emptySet();
+        }
+    }
+
+    /**
+     * Getter for the properties of this TagSet
+     * @return the live (modifiable) properties map
+     */
+    public Map<String,Object> getProperties(){
+        return properties;
+    }
+
+    /**
+     * @return the name
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * @return the languages (unmodifiable, may be empty)
+     */
+    public Set<String> getLanguages() {
+        return languages;
+    }
+    /**
+     * Adds a Tag to this TagSet and assigns this TagSet to the tag.
+     * A <code>null</code> tag is ignored.
+     * @param tag the tag to add
+     * @throws IllegalStateException if the tag is already assigned to a TagSet
+     */
+    public void addTag(T tag){
+        if(tag != null){
+            if(tag.getAnnotationModel() != null){
+                throw new IllegalStateException("Unable to add "+tag+" to "+this
+                    + " because it is already assigned to "+tag.getAnnotationModel());
+            }
+            tag.setAnnotationModel(this);
+            tag2PosTag.put(tag.getTag(), tag);
+        }
+    }
+    /** @return the Tag added for the parsed tag string or <code>null</code> if none */
+    public T getTag(String tag){
+        return tag2PosTag.get(tag);
+    }
+
+    @Override
+    public Iterator<T> iterator() {
+        return tag2PosTag.values().iterator();
+    }
+
+    @Override
+    public String toString() {
+        return String.format("AnnotationModel [name: %s |languages: %s]",
+            getName(), languages);
+    }
+
+    @Override
+    public int hashCode() {
+        return name.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        return obj instanceof TagSet && name.equals(((TagSet<?>)obj).name);
+    }
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedText.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,80 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+
+/**
+ * Provides access to NLP processing results of the <code>text/plain</code>
+ * {@link Blob} of a ContentItem. Intended to be
+ * {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef, Object) added
+ * as ContentPart} by using {@link #ANALYSED_TEXT_URI}.
+ * @see ContentItem#addPart(UriRef, Object)
+ */
+public interface AnalysedText extends Section{
+
+
+ /**
+ * The {@link UriRef} used to register the {@link AnalysedText} instance
+ * as {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef, Object)
+ * ContentPart} to the {@link ContentItem}.
+ */
+ public static final UriRef ANALYSED_TEXT_URI = new UriRef("urn:stanbol.enhancer:nlp.analysedText");
+
+ /**
+ * The type of an AnalysedText: always {@link SpanTypeEnum#Text}.
+ * @see Span#getType()
+ * @see SpanTypeEnum#Text
+ */
+ SpanTypeEnum getType();
+
+ /**
+ * Adds a Sentence
+ * @param start the start index
+ * @param end the end index
+ * @return the Sentence
+ */
+ Sentence addSentence(int start, int end);
+
+ /**
+ * Adds a Chunk
+ * @param start the start of the chunk
+ * @param end the end of the chunk
+ * @return the Chunk
+ */
+ Chunk addChunk(int start, int end);
+
+ /**
+ * All sentences of the analysed text.<p>
+ * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+ * but consider additions of Spans.
+ * @return the sentences
+ */
+ Iterator<Sentence> getSentences();
+
+ /**
+ * All Chunks of this analysed text.<p>
+ * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+ * but consider additions of Spans.
+ * @return the chunks
+ */
+ Iterator<Chunk> getChunks();
+
+ /**
+ * Getter for the text.
+ * @return the text
+ */
+ CharSequence getText();
+
+ /**
+ * The analysed {@link Blob}. Typically {@link Blob#getMimeType()} will be
+ * <code>text/plain</code>.
+ * @return the analysed {@link Blob} instance.
+ */
+ Blob getBlob();
+
+
+}
\ No newline at end of file
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextFactory.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,59 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.io.IOException;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.impl.AnalysedTextFactoryImpl;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+
+/** Factory for {@link AnalysedText} instances; see {@link #getDefaultInstance()}. */
+public abstract class AnalysedTextFactory {
+    private static final AnalysedTextFactory defaultInstance = new AnalysedTextFactoryImpl();
+
+    /**
+     * Creates an {@link AnalysedText} instance for the parsed {@link Blob}
+     * and registers it as
+     * {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef, Object)
+     * ContentPart} with the {@link UriRef} {@link AnalysedText#ANALYSED_TEXT_URI}
+     * to the parsed {@link ContentItem}.<p>
+     * If a ContentPart with the given UriRef is already registered this
+     * Method will throw an {@link IllegalStateException}.
+     * @param ci the ContentItem to register the created {@link AnalysedText} instance
+     * @param blob the analysed {@link Blob}
+     * @return the created {@link AnalysedText}
+     * @throws IllegalArgumentException if <code>null</code> is parsed as
+     * ContentItem or Blob
+     * @throws IllegalStateException if there is already a ContentPart
+     * registered for {@link AnalysedText#ANALYSED_TEXT_URI} with the parsed
+     * ContentItem.
+     * @throws IOException on any error while reading data from the parsed blob
+     */
+    public abstract AnalysedText createAnalysedText(ContentItem ci, Blob blob) throws IOException ;
+    /**
+     * Creates an AnalysedText instance for the parsed blob.<p>
+     * NOTE: This method does NOT register the {@link AnalysedText}
+     * as ContentPart.
+     * @param blob the analysed Blob
+     * @return the AnalysedText
+     * @throws IllegalArgumentException if <code>null</code> is parsed as
+     * {@link Blob}.
+     * @throws IOException on any error while reading data from the parsed blob
+     */
+    public abstract AnalysedText createAnalysedText(Blob blob) throws IOException ;
+
+    /**
+     * Intended to be used outside of an OSGI container to obtain an
+     * instance of an {@link AnalysedTextFactory}. <p>
+     * When using this within an OSGI environment it is preferred to obtain
+     * the factory as a service (e.g. via the BundleContext, a ServiceTracker
+     * or by injection), as this allows the usage of different implementations.
+     * <p>
+     * This is hard-wired with the default implementation contained within this
+     * module.
+     * @return the default {@link AnalysedTextFactory} instance.
+     */
+    public static final AnalysedTextFactory getDefaultInstance(){
+        return defaultInstance;
+    }
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextUtils.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,154 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
+
+/** Static helpers for working with {@link AnalysedText} and its {@link Span}s. */
+public class AnalysedTextUtils {
+    /**
+     * Getter for the {@link AnalysedText} content part of the parsed ContentItem.<p>
+     * This assumes that the AnalysedText is registered by using
+     * {@link AnalysedText#ANALYSED_TEXT_URI}. Otherwise it will not find it.
+     * @param ci The {@link ContentItem}
+     * @return the {@link AnalysedText} or <code>null</code> if not present.
+     * @throws ClassCastException if a content part is registered with
+     * {@link AnalysedText#ANALYSED_TEXT_URI} but its type is not compatible
+     * to {@link AnalysedText}.
+     */
+    public static AnalysedText getAnalysedText(ContentItem ci){
+        ci.getLock().readLock().lock();
+        try {
+            return ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
+        } catch (NoSuchPartException e) {
+            return null;
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
+    }
+    /**
+     * Copies the elements of the parsed iterator to a list.
+     * @param it the iterator
+     * @return the List with all spans of the Iterator (empty and unmodifiable if none)
+     */
+    public static <T extends Span> List<T> asList(Iterator<T> it){
+        if(it == null || !it.hasNext()){
+            return Collections.emptyList();
+        } else {
+            List<T> spans = new ArrayList<T>();
+            appandToList(it, spans);
+            return spans;
+        }
+    }
+    /**
+     * Appends the elements provided by the parsed Iterator to the list.
+     * @param it the Iterator; <code>null</code> is ignored
+     * @param list the List
+     * @throws NullPointerException if the parsed List is <code>null</code>
+     */
+    public static <T extends Span> void appandToList(Iterator<T> it, List<? super T> list){
+        if(it != null){
+            while(it.hasNext()){
+                list.add(it.next());
+            }
+        }
+    }
+    /**
+     * Copies the elements of the parsed iterator to a {@link SortedSet}. As
+     * {@link Span} implements {@link Comparable} the Spans within the resulting
+     * set will have the same order as returned by the methods of {@link AnalysedText}
+     * @param it the iterator; <code>null</code> results in an empty set
+     * @return the {@link SortedSet} containing all Spans of the iterator
+     */
+    public static <T extends Span> SortedSet<T> asSet(Iterator<T> it){
+        SortedSet<T> spans = new TreeSet<T>();
+        addToSet(it, spans);
+        return spans;
+    }
+    /**
+     * Adds the Spans of the parsed Iterator to the parsed Set
+     * @param it the Iterator; <code>null</code> is ignored
+     * @param set the set
+     * @throws NullPointerException if the parsed Set is <code>null</code>
+     */
+    public static <T extends Span> void addToSet(Iterator<T> it,Set<? super T> set){
+        if(it != null){
+            while(it.hasNext()){
+                set.add(it.next());
+            }
+        }
+    }
+    /**
+     * Iterates over two levels of the Span hierarchy (e.g. all Tokens of a
+     * Sentence that are within a Chunk). The returned Iterator is a live
+     * view on the {@link AnalysedText} (being the context of the enclosing
+     * Span).<p>
+     * Usage Example
+     * <code><pre>
+     *     Sentence sentence; //The currently processed Sentence
+     *     Iterator&lt;Span&gt; tokens = AnalysedTextUtils.getSpansInSpans(
+     *         sentence,
+     *         {@link SpanTypeEnum#Chunk SpanTypeEnum.Chunk},
+     *         {@link SpanTypeEnum#Token SpanTypeEnum.Token});
+     *     while(tokens.hasNext()){
+     *         Token token = (Token)tokens.next();
+     *         // process only tokens within a chunk
+     *     }
+     * </pre></code>
+     * @param section the Section enclosing the level1 Spans
+     * @param level1 the {@link SpanTypeEnum} for the first Level. MUST be
+     * a Type that is a {@link Section} (e.g. Chunk or Sentence).
+     * @param level2 the {@link SpanTypeEnum} of the Spans to iterate over
+     * @return the Iterator over the level2 Spans enclosed by the level1 Spans
+     * @throws IllegalArgumentException if {@link SpanTypeEnum#Token} is parsed
+     * as <code>level1</code> span type.
+     */
+    public static Iterator<Span> getSpansInSpans(Section section, SpanTypeEnum level1, final SpanTypeEnum level2){
+        if(level1 == SpanTypeEnum.Token){
+            throw new IllegalArgumentException("The SpanType for level1 MUST refer to a Section "
+                + "(Chunk, Sentence, TextSection or Text)");
+        }
+        final Iterator<Span> level1It = section.getEnclosed(EnumSet.of(level1));
+        return new Iterator<Span>(){
+            Iterator<Span> level2It = null;
+            @Override
+            public boolean hasNext() {
+                if(level2It != null && level2It.hasNext()) {
+                    return true;
+                } else {
+                    while(level1It.hasNext()){
+                        level2It = ((Section)level1It.next()).getEnclosed(EnumSet.of(level2));
+                        if(level2It.hasNext()){
+                            return true;
+                        }
+                    }
+                }
+                return false;
+            }
+            @Override
+            public Span next() {
+                //hasNext() advances level2It to the next level1 Span with elements
+                if(!hasNext()){
+                    throw new java.util.NoSuchElementException();
+                }
+                return level2It.next();
+            }
+            @Override
+            public void remove() {
+                if(level2It == null){
+                    throw new IllegalStateException("next() was not yet called");
+                }
+                level2It.remove();
+            }
+        };
+    }
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Chunk.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,13 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+
+public interface Chunk extends Section {
+
+ /**
+ * The type of a Chunk: always {@link SpanTypeEnum#Chunk}.
+ * @see Span#getType()
+ * @see SpanTypeEnum#Chunk
+ */
+ SpanTypeEnum getType();
+
+}
\ No newline at end of file
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Section.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,41 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * A {@link Span} that may enclose other Spans. Super type for {@link Chunk}s,
+ * {@link Sentence}s and {@link AnalysedText}.<p>
+ * As {@link Span} this is a meta (abstract) type. Implementations of this
+ * Interface SHOULD BE abstract Classes.
+ */
+public interface Section extends Span {
+
+ /**
+ * Iterates over all Spans enclosed by this one that are of any of the
+ * parsed Types.<p>
+ * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+ * but consider additions of Spans.
+ * @param types the {@link SpanTypeEnum types} of Spans included
+ * @return sorted iterator over the selected Spans.
+ */
+ Iterator<Span> getEnclosed(Set<SpanTypeEnum> types);
+
+ /**
+ * Adds a Token relative to this Section
+ * @param start the start of the token relative to this section
+ * @param end the end of the token (NOTE(review): presumably relative to this section like start - confirm)
+ * @return the Token
+ */
+ Token addToken(int start, int end);
+
+ /**
+ * The Tokens covered by this Section.<p>
+ * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+ * but consider additions of Spans.
+ * @return the tokens
+ */
+ Iterator<Token> getTokens();
+
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Sentence.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,32 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import java.util.ConcurrentModificationException;
+import java.util.Iterator;
+
+public interface Sentence extends Section {
+
+ /**
+ * The type of a Sentence: always {@link SpanTypeEnum#Sentence}.
+ * @see Span#getType()
+ * @see SpanTypeEnum#Sentence
+ */
+ SpanTypeEnum getType();
+
+ /**
+ * Adds a Chunk relative to this Sentence
+ * @param start the start of the chunk relative to the sentence
+ * @param end the end of the chunk (NOTE(review): presumably relative to the sentence like start - confirm)
+ * @return the Chunk
+ */
+ Chunk addChunk(int start, int end);
+
+
+ /**
+ * The Chunks covered by this Sentence.<p>
+ * Returned Iterators MUST NOT throw {@link ConcurrentModificationException}
+ * but consider additions of Spans.
+ * @return the chunks
+ */
+ Iterator<Chunk> getChunks();
+
+}
\ No newline at end of file
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Span.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,73 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+
+/**
+ * Represents a {@link #getSpan() span} [{@link #getStart() start},
+ * {@link #getEnd() end}] within the {@link #getContext() text}. Spans also have
+ * an assigned {@link #getType() type}. Possible types are defined within the
+ * {@link SpanTypeEnum}.<p>
+ * This is a meta (abstract) type. Implementations of this Interface
+ * SHOULD BE abstract Classes.
+ */
+public interface Span extends Annotated, Comparable<Span>{
+
+ /**
+ * Enumeration over the different types - or roles - spans defined for an
+ * {@link AnalysedText} may play.
+ */
+ public static enum SpanTypeEnum {
+ /**
+ * The Text as a whole
+ */
+ Text,
+ /**
+ * A section of the text (chapter, page, paragraph ...). NOTE: this
+ * does NOT define types of sections.
+ */
+ TextSection,
+ /**
+ * A Sentence
+ */
+ Sentence,
+ /**
+ * A Chunk (e.g. a Noun Phrase) NOTE: this does NOT define types of
+ * Chunks
+ */
+ Chunk,
+ /**
+ * A Token (e.g. a noun, verb, punctuation) NOTE: this does NOT define
+ * types of Tokens
+ */
+ Token;
+ }
+ /**
+ * The type of the Span
+ * @return the {@link SpanTypeEnum type} of this Span
+ */
+ SpanTypeEnum getType();
+
+ /**
+ * The start index of this span. This is the absolute offset from the
+ * {@link #getContext()}{@link AnalysedText#getText() .getText()}
+ */
+ int getStart();
+ /**
+ * The end index of this span. This is the absolute offset from the
+ * {@link #getContext()}{@link AnalysedText#getText() .getText()}
+ * @return the end index
+ */
+ int getEnd();
+
+ /**
+ * The {@link AnalysedText} this Span was added to.
+ * @return the AnalysedText representing the context of this Span
+ */
+ AnalysedText getContext();
+ /**
+ * The section of the text selected by this span
+ * @return the selected section of the text
+ */
+ String getSpan();
+
+}
\ No newline at end of file
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/SpanTypeEnum.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,2 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/Token.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,15 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+
+
+/**
+ * A {@link Span} whose {@link #getType() type} is documented to be
+ * {@link SpanTypeEnum#Token}.
+ */
+public interface Token extends Span {
+
+ /**
+ * Returns {@link SpanTypeEnum#Token}
+ * @return {@link SpanTypeEnum#Token}
+ * @see Span#getType()
+ * @see SpanTypeEnum#Token
+ */
+ SpanTypeEnum getType();
+
+
+}
\ No newline at end of file
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotated.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,70 @@
+package org.apache.stanbol.enhancer.nlp.model.annotation;
+
+import java.util.List;
+
+/**
+ * Interface for objects that hold {@link Value}s stored under keys. Values
+ * can be accessed either directly by the key or by using an
+ * {@link Annotation} that defines both the key and the type of the values.
+ */
+public interface Annotated {
+
+
+
+ /**
+ * Method for requesting the Value of a given Key. This allows to request
+ * a Value without an {@link Annotation}.
+ * @param key the Key
+ * @return the Value with the highest probability
+ */
+ Value<?> getValue(Object key);
+
+ /**
+ * Method for requesting the Value of an Annotation.
+ * @param annotation the requested {@link Annotation}
+ * @return the Value with the highest probability
+ * @throws ClassCastException if the values stored for
+ * {@link Annotation#getKey()} are not of type V
+ */
+ <V> Value<V> getAnnotation(Annotation<?,V> annotation);
+
+ /**
+ * Method for requesting all Values of a given Key. This allows to request
+ * Values without an {@link Annotation}.
+ * @param key the Key
+ * @return all Values sorted by probability
+ */
+ List<Value<?>> getValues(Object key);
+
+ /**
+ * Method for requesting all Values of an Annotation.
+ * @param annotation the requested {@link Annotation}
+ * @return all Values sorted by probability
+ * @throws ClassCastException if the values stored for
+ * {@link Annotation#getKey()} are not of type V
+ */
+ <V> List<Value<V>> getAnnotations(Annotation<?,V> annotation);
+
+ /**
+ * Appends a value for an Annotation to possibly already existing values
+ * @param annotation the annotation
+ * @param value the value to append
+ */
+ <K,V> void addAnnotation(Annotation<K,V> annotation, Value<V> value);
+
+ /**
+ * Replaces existing values of an Annotation with the passed one
+ * @param annotation the annotation
+ * @param value the value for the annotation
+ */
+ <K,V> void setAnnotation(Annotation<K,V> annotation, Value<V> value);
+
+ /**
+ * Appends values for an Annotation to possibly already existing values
+ * @param annotation the annotation
+ * @param values the values to append
+ */
+ <K,V> void addAnnotations(Annotation<K,V> annotation, List<Value<V>> values);
+
+ /**
+ * Replaces existing values of an Annotation with the passed ones
+ * @param annotation the annotation
+ * @param values the values for the annotation
+ */
+ <K,V> void setAnnotations(Annotation<K,V> annotation, List<Value<V>> values);
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Annotation.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,39 @@
+package org.apache.stanbol.enhancer.nlp.model.annotation;
+
+/**
+ * Definition of an Annotation including the <ul>
+ * <li>key used to store the Annotation
+ * <li>generic type of Values for this Annotation
+ * </ul>
+ * Both the key and the value type are set by the constructor and can not
+ * be changed afterwards.
+ *
+ * @param <K> the type of the key
+ * @param <V> the type of the values
+ */
+public final class Annotation<K,V> {
+
+    /**
+     * The key used to store values of this Annotation
+     */
+    final K key;
+    /**
+     * The type of the values of this Annotation
+     */
+    final Class<V> valueType;
+
+    /**
+     * Creates the definition of an Annotation.
+     * @param key the key (MUST NOT be NULL)
+     * @param valueType the type of the values (MUST NOT be NULL)
+     * @throws IllegalArgumentException if the key or the value type is NULL
+     */
+    public Annotation(K key,Class<V> valueType){
+        //both arguments are required: the value type needs its own NULL
+        //check (testing the key twice would let a NULL value type pass)
+        if(key == null || valueType == null){
+            throw new IllegalArgumentException("Key and Value MUST NOT be NULL!");
+        }
+        this.key = key;
+        this.valueType = valueType;
+    }
+
+    /**
+     * Getter for the key
+     * @return the key used to store values of this Annotation
+     */
+    public K getKey(){
+        return key;
+    }
+
+    /**
+     * Getter for the value type
+     * @return the type of the values of this Annotation
+     */
+    public Class<V> getValueType(){
+        return valueType;
+    }
+
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/annotation/Value.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,118 @@
+package org.apache.stanbol.enhancer.nlp.model.annotation;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Holder for a value together with the probability of this value. Both are
+ * set by the constructor and can not be changed afterwards. Values without
+ * a known probability use {@link #UNKNOWN_PROBABILITY}.
+ *
+ * @param <T> the type of the value
+ */
+public final class Value<T> {
+
+    /**
+     * For Values that do not have a probability we use <code>-1.0</code>
+     */
+    public static final double UNKNOWN_PROBABILITY = -1.0d;
+
+    /**
+     * The value
+     */
+    private final T value;
+    /**
+     * The probability of the Annotation
+     */
+    private final double probability;
+
+    /**
+     * Creates an Annotation for the value with an {@link #UNKNOWN_PROBABILITY
+     * unknown probability}.
+     * @param value the value
+     * @throws IllegalArgumentException if the value is <code>null</code>
+     */
+    public Value(T value){
+        this(value,UNKNOWN_PROBABILITY);
+    }
+
+    /**
+     * Creates an Annotation for the value with the given probability.
+     * @param value the value
+     * @param probability the probability in the range [0..1] or
+     * {@link #UNKNOWN_PROBABILITY}
+     * @throws IllegalArgumentException if the value is <code>null</code> or
+     * the probability is greater than one or lower than zero and not
+     * {@link #UNKNOWN_PROBABILITY}
+     */
+    public Value(T value, double probability){
+        if(value == null){
+            throw new IllegalArgumentException("The parsed Value MUST NOT be NULL!");
+        }
+        this.value = value;
+        //NOTE: a NaN probability is not rejected by this check, because
+        //      all comparisons with NaN evaluate to false
+        if(probability != UNKNOWN_PROBABILITY && (probability > 1 || probability < 0)){
+            throw new IllegalArgumentException("Probabilities MUST BE in the range [0..1]");
+        }
+        this.probability = probability;
+    }
+
+    /**
+     * Getter for the value
+     * @return the value
+     */
+    public final T value(){
+        return value;
+    }
+
+    /**
+     * Getter for the probability
+     * @return the probability or {@link #UNKNOWN_PROBABILITY}
+     */
+    public final double probability(){
+        return probability;
+    }
+
+    /**
+     * Creates a Value with an {@link #UNKNOWN_PROBABILITY unknown probability}
+     * @param value the value
+     * @return the created Value
+     */
+    public static <T> Value<T> value(T value){
+        return new Value<T>(value);
+    }
+
+    /**
+     * Creates a Value with the given probability
+     * @param value the value
+     * @param probability the probability
+     * @return the created Value
+     */
+    public static <T> Value<T> value(T value,double probability){
+        return new Value<T>(value,probability);
+    }
+
+    /**
+     * Creates a list of Values with an {@link #UNKNOWN_PROBABILITY unknown
+     * probability} - one for each of the passed values.
+     * @param values the values
+     * @return the list with the created Values in the order of the passed array
+     */
+    public static <T> List<Value<T>> values(T...values){
+        List<Value<T>> valList = new ArrayList<Value<T>>(values.length);
+        for(T value : values){
+            valList.add(new Value<T>(value));
+        }
+        return valList;
+    }
+    /**
+     * Creates a list of Values for all elements of the values array by using
+     * the probability at the same index of the probabilities array.
+     * @param values the values
+     * @param probabilities the probabilities. Needs to provide at least
+     * as many elements as the values array
+     * @return the list with the created Values
+     */
+    public static <T> List<Value<T>> values(T[] values, double[] probabilities){
+        return values(values,probabilities,values.length);
+    }
+    /**
+     * Creates a list of Values for the first <code>elements</code> entries
+     * of the passed arrays.
+     * @param values the values
+     * @param probabilities the probabilities
+     * @param elements the number of elements to read from both arrays
+     * @return the list with the created Values
+     */
+    public static <T> List<Value<T>> values(T[] values, double[] probabilities,int elements){
+        List<Value<T>> valList = new ArrayList<Value<T>>(elements);
+        for(int i=0;i<elements;i++){
+            valList.add(new Value<T>(values[i],probabilities[i]));
+        }
+        return valList;
+    }
+
+    @Override
+    public int hashCode() {
+        //for long hash see
+        //http://docs.oracle.com/javase/6/docs/api/java/lang/Double.html#hashCode()
+        long bits = Double.doubleToLongBits(probability);
+        return value.hashCode() + (int)(bits^(bits>>>32));
+    }
+
+    /**
+     * Two Values are equal if both the value and the probability are equal.
+     * Probabilities are compared by using {@link Double#compare(double, double)}
+     * so that equal Values always have the same {@link #hashCode()}.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if(!(obj instanceof Value)){
+            return false;
+        }
+        Value<?> other = (Value<?>)obj;
+        //do not use '==' for the probability: it would consider 0.0 and
+        //-0.0 as equal (while their hash differs) and NaN as not equal to
+        //itself
+        return value.equals(other.value) &&
+                Double.compare(probability, other.probability) == 0;
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder("Value [");
+        sb.append(value.toString()).append(']');
+        if(probability != UNKNOWN_PROBABILITY){
+            sb.append(".prob=").append(probability);
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Comparator that sorts Values ONLY based on {@link Value#probability()}
+     * (higher probabilities first) -
+     * DO NOT USE with {@link Set} implementations as it will only allow a
+     * single Value with the same probability.<p>
+     * Values with {@link #UNKNOWN_PROBABILITY} are considered as lowest
+     * probability.
+     */
+    public static final Comparator<Value<?>> PROBABILITY_COMPARATOR = new Comparator<Value<?>>() {
+
+        @Override
+        public int compare(Value<?> o1, Value<?> o2) {
+            return Double.compare(o2.probability, o1.probability);
+        }
+    };
+
+
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextFactoryImpl.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,59 @@
+package org.apache.stanbol.enhancer.nlp.model.impl;
+
+import java.io.IOException;
+
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.osgi.framework.Constants;
+
+/**
+ * {@link AnalysedTextFactory} implementation that creates
+ * {@link AnalysedTextImpl} instances. It is registered as service with
+ * {@link Integer#MIN_VALUE} as service ranking.
+ */
+@Component(immediate=true)
+@Service(value=AnalysedTextFactory.class)
+@Properties(value={
+    @Property(name=Constants.SERVICE_RANKING,intValue=Integer.MIN_VALUE)
+})
+public class AnalysedTextFactoryImpl extends AnalysedTextFactory {
+
+    /**
+     * Creates the {@link AnalysedText} for the passed blob and adds it as
+     * content part with the URI {@link AnalysedText#ANALYSED_TEXT_URI} to
+     * the passed content item.
+     * @param ci the content item the AnalysedText is added to
+     * @param blob the blob with the text
+     * @return the created AnalysedText
+     * @throws IllegalStateException if the content item already has a
+     * content part with the URI {@link AnalysedText#ANALYSED_TEXT_URI}
+     * @throws IOException on any error while reading the text from the blob
+     */
+    @Override
+    public AnalysedText createAnalysedText(ContentItem ci, Blob blob) throws IOException {
+        assertNoAnalysedTextPart(ci);
+        //the text is created without holding any lock on the content item
+        AnalysedText analysedText = createAnalysedText(blob);
+        ci.getLock().writeLock().lock();
+        try {
+            //NOTE: the check above was done under a read lock that is
+            //      already released. So an other thread might have added
+            //      the content part in the meantime
+            ci.addPart(AnalysedText.ANALYSED_TEXT_URI, analysedText);
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+        return analysedText;
+    }
+
+    /**
+     * Checks (within a read lock on the content item) that no content part
+     * with the URI {@link AnalysedText#ANALYSED_TEXT_URI} is present.
+     * @param ci the content item to check
+     * @throws IllegalStateException if such a content part exists
+     */
+    private void assertNoAnalysedTextPart(ContentItem ci) {
+        ci.getLock().readLock().lock();
+        try {
+            AnalysedText existing = ci.getPart(AnalysedText.ANALYSED_TEXT_URI, AnalysedText.class);
+            String impl = existing.getClass().getSimpleName();
+            String mimeType = existing.getBlob().getMimeType();
+            throw new IllegalStateException("The AnalysedText ContentPart already exists (impl: "
+                    + impl + "| blob: " + mimeType + ")");
+        } catch (NoSuchPartException expected) {
+            //no such part present - this is the expected case
+        } catch (ClassCastException e) {
+            String expectedType = AnalysedText.class.getSimpleName();
+            throw new IllegalStateException("A ContentPart with the URI '"
+                    + AnalysedText.ANALYSED_TEXT_URI + "' already exists but the parts "
+                    + "type is not compatible with " + expectedType + "!",
+                    e);
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
+    }
+
+    /**
+     * Creates an {@link AnalysedTextImpl} for the text read from the
+     * passed blob.
+     * @param blob the blob with the text
+     * @return the created AnalysedText
+     * @throws IOException on any error while reading the text from the blob
+     */
+    @Override
+    public AnalysedText createAnalysedText(Blob blob) throws IOException {
+        return new AnalysedTextImpl(blob, ContentItemHelper.getText(blob));
+    }
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/model/impl/AnalysedTextImpl.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,89 @@
+package org.apache.stanbol.enhancer.nlp.model.impl;
+
+import java.util.Iterator;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+
+/**
+ * The {@link AnalysedText} implementation. This is the class added as
+ * ContentPart to the contentItem.
+ * @author westei
+ *
+ */
+public class AnalysedTextImpl extends SectionImpl implements AnalysedText {
+
+    /**
+     * The blob whose data got analysed
+     */
+    private final Blob blob;
+    /**
+     * The analysed text
+     */
+    private final String text;
+
+    /**
+     * Map over spans; not accessed by the code of this class itself
+     */
+    protected NavigableMap<Span,Span> spans = new TreeMap<Span,Span>();
+
+    /**
+     * Creates the AnalysedText as a span of type {@link SpanTypeEnum#Text}
+     * that starts at <code>0</code> and ends at the length of the text.
+     * @param blob the blob
+     * @param text the text
+     */
+    public AnalysedTextImpl(Blob blob, String text){
+        super(SpanTypeEnum.Text,0,text.length());
+        //the AnalysedText uses itself as context
+        this.setContext(this);
+        this.blob = blob;
+        this.text = text;
+    }
+
+    @Override
+    public SpanTypeEnum getType() {
+        return SpanTypeEnum.Text;
+    }
+
+    @Override
+    public CharSequence getText() {
+        return text;
+    }
+
+    @Override
+    public Blob getBlob() {
+        return blob;
+    }
+
+    /**
+     * Reference to the Blob whose data got analysed
+     * @return the blob passed to the constructor
+     */
+    public final Blob getAnalysedBlob(){
+        return blob;
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.stanbol.enhancer.nlp.model.impl.AnalyzedText#addSentence(int, int)
+     */
+    @Override
+    public SentenceImpl addSentence(int start, int end){
+        SentenceImpl sentence = new SentenceImpl(context, this, start, end);
+        return register(sentence);
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.stanbol.enhancer.nlp.model.impl.AnalyzedText#getSentences()
+     */
+    @Override
+    public Iterator<Sentence> getSentences(){
+        return filter(Sentence.class);
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.stanbol.enhancer.nlp.model.impl.Sentence#addChunk(int, int)
+     */
+    @Override
+    public ChunkImpl addChunk(int start, int end){
+        ChunkImpl chunk = new ChunkImpl(context, this, start, end);
+        return register(chunk);
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.stanbol.enhancer.nlp.model.impl.Sentence#getChunks()
+     */
+    @Override
+    public Iterator<Chunk> getChunks(){
+        return filter(Chunk.class);
+    }
+}