Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/10/22 12:09:42 UTC

svn commit: r1633587 - in /stanbol/trunk: enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/ enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/

Author: rwesten
Date: Wed Oct 22 10:09:41 2014
New Revision: 1633587

URL: http://svn.apache.org/r1633587
Log:
first implementation of an NLP 2 NIF 2.0 engine (STANBOL-1397)

Added:
    stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
    stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
    stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java

Added: stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java?rev=1633587&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java (added)
+++ stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java Wed Oct 22 10:09:41 2014
@@ -0,0 +1,302 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.nlp2rdf.engine;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.nif.Nif20;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+
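+/**
+ * Utility class providing static helpers to write NLP processing results as
+ * <a href="http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#">NIF 2.0</a>
+ * RDF: methods to build NIF fragment, RFC 5147 and context hash based URIs for
+ * {@link Span}s and to write span, POS and phrase information to an {@link MGraph}.
+ */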
+public final class Nif20Helper {
+    
+    private static final LiteralFactory lf = LiteralFactory.getInstance();
+
+    private Nif20Helper(){}
+    
+    public static final Map<SpanTypeEnum,UriRef> SPAN_TYPE_TO_SSO_TYPE;
+    static {
+        Map<SpanTypeEnum,UriRef> mapping = new EnumMap<SpanTypeEnum,UriRef>(SpanTypeEnum.class);
+        //mapping.put(SpanTypeEnum.Text, null);
+        //mapping.put(SpanTypeEnum.TextSection, null);
+        mapping.put(SpanTypeEnum.Sentence, Nif20.Sentence.getUri());
+        mapping.put(SpanTypeEnum.Chunk, Nif20.Phrase.getUri());
+        mapping.put(SpanTypeEnum.Token, Nif20.Word.getUri());
+        SPAN_TYPE_TO_SSO_TYPE = Collections.unmodifiableMap(mapping);
+    }
+    
+    /**
+     * Read-only map that maps from the {@link LexicalCategory} to the OLIA
+     * Concept representing the Phrase (e.g. {@link LexicalCategory#Noun} maps
+     * to "<code>http://purl.org/olia/olia.owl#NounPhrase</code>").
+     */
+    public static final Map<LexicalCategory,UriRef> LEXICAL_TYPE_TO_PHRASE_TYPE;
+    static {
+        String olia = "http://purl.org/olia/olia.owl#";
+        Map<LexicalCategory,UriRef> mapping = new EnumMap<LexicalCategory,UriRef>(LexicalCategory.class);
+        mapping.put(LexicalCategory.Noun, new UriRef(olia+"NounPhrase"));
+        mapping.put(LexicalCategory.Verb, new UriRef(olia+"VerbPhrase"));
+        mapping.put(LexicalCategory.Adjective, new UriRef(olia+"AdjectivePhrase"));
+        mapping.put(LexicalCategory.Adverb, new UriRef(olia+"AdverbPhrase"));
+        mapping.put(LexicalCategory.Conjuction, new UriRef(olia+"ConjuctionPhrase"));
+        LEXICAL_TYPE_TO_PHRASE_TYPE = Collections.unmodifiableMap(mapping);
+    }    
+    /**
+     * Creates a NIF 2.0 fragment URI using the passed base URI and the start/end
+     * indexes.
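+     * <p>
+     * For example (illustrative values), for the base URI
+     * <code>http://example.org/content</code>, start <code>3</code> and end
+     * <code>15</code> the returned URI is
+     * <code>http://example.org/content#char=3,15</code>; for a negative end
+     * index the fragment stays open ended (<code>#char=3,</code>).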
+     * @param base the base URI
+     * @param start the start position. If <code>&lt; 0</code> then zero is used.
+     * @param end the end position, or a value &lt; 0 for an open ended selection.
+     * @return the NIF 2.0 Fragment URI
+     * @throws IllegalArgumentException if <code>null</code> is passed as base
+     * {@link UriRef} or the end position is &gt;= 0 but &lt; the passed start
+     * position.
+     */
+    public static final UriRef getNifFragmentURI(UriRef base, int start,int end){
+        if(base == null){
+            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+        }
+        StringBuilder sb = new StringBuilder(base.getUnicodeString());
+        sb.append("#char=");
+        sb.append(start >= 0 ? start : 0).append(',');
+        if(end >= 0){
+            if(end < start){
+                throw new IllegalArgumentException("End index '"+end+"' < start '"+start+"'!");
+            }
+            sb.append(end);
+        } //else open ended ...
+        return new UriRef(sb.toString());
+    }
+ 
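+    /**
+     * Creates a NIF 2.0 <a href="http://tools.ietf.org/html/rfc5147">RFC 5147</a>
+     * based fragment URI for the passed base URI and start/end indexes.
+     * <p>
+     * For example (illustrative values), for the base URI
+     * <code>http://example.org/content</code>, start <code>0</code> and end
+     * <code>28</code> this returns <code>http://example.org/content#char=0,28</code>;
+     * if a negative end index is passed only the start index is written
+     * (<code>#char=0</code>) and the whole remaining string is selected.
+     * @param base the base URI (MUST NOT be <code>null</code>)
+     * @param start the start index
+     * @param end the end index or a negative value to select the whole string
+     * @return the RFC 5147 based fragment URI
+     */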
+    public static final UriRef getNifRFC5147URI(UriRef base, int start, int end){
+        if(base == null){
+            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+        }
+        assert start >= 0;
+        assert end < 0 || end >= start;
+        StringBuilder sb = new StringBuilder(base.getUnicodeString());
+        sb.append("#char=");
+        sb.append(start >= 0 ? start : 0);
+        if(end >= 0){
+            sb.append(',').append(end);
+        } //else select the whole string ...
+        return new UriRef(sb.toString());
+    }
+    
+    public static final int NIF_HASH_CONTEXT_LENGTH = 10;
+    public static final int NIF_HASH_MAX_STRING_LENGTH = 20;
+    
+    public static final Charset UTF8 = Charset.forName("UTF8");
+    
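+    /**
+     * Creates a NIF 2.0 context hash based URI (see {@link Nif20#ContextHashBasedString})
+     * of the form <code>{base}#hash_{contextLength}_{length}_{md5}_{string}</code>,
+     * where <code>contextLength</code> is {@link #NIF_HASH_CONTEXT_LENGTH},
+     * <code>length</code> is <code>end - start</code>, <code>md5</code> is the
+     * digest over the selected string enclosed in brackets and surrounded by its
+     * context and <code>string</code> is the selected text truncated to
+     * {@link #NIF_HASH_MAX_STRING_LENGTH} chars.
+     * @param base the base URI (MUST NOT be <code>null</code>)
+     * @param start the start index of the selection
+     * @param end the end index of the selection
+     * @param text the text of the context
+     * @return the context hash based URI for the selection
+     */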
+    public static final UriRef getNifHashURI(UriRef base, int start, int end, String text){
+        if(base == null){
+            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+        }
+        start = start < 0 ? 0 : start;
+        end = end < 0 ? start : end;
+        if(end < start){
+            throw new IllegalArgumentException("End index '"+end+"' < start '"+start+"'!");
+        }
+        if(end >= text.length()){
+            throw new IllegalArgumentException("The End index '"+end+"' exceeds the "
+                + "length of the text '"+text.length()+"'!");
+        }
+        int contextStart = Math.max(0, start-NIF_HASH_CONTEXT_LENGTH);
+        int contextEnd = Math.min(text.length(), end+NIF_HASH_CONTEXT_LENGTH);
+        StringBuilder sb = new StringBuilder(base.getUnicodeString());
+        sb.append("#hash_");
+        sb.append(NIF_HASH_CONTEXT_LENGTH);
+        sb.append('_');
+        sb.append(end-start);
+        sb.append('_');
+        sb.append(getContextDigest(text, contextStart, start, end, contextEnd));
+        sb.append('_');
+        sb.append(text.substring(start, 
+            Math.min(end,start+NIF_HASH_MAX_STRING_LENGTH)));
+        return new UriRef(sb.toString());
+    }
+
+    /**
+     * Calculates the MD5 digest over the UTF8 byte representation of the
+     * '{prefix}({selected}){suffix}' string built from the passed parameters.
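+     * <p>
+     * For example (illustrative values), for the text
+     * <code>"Apple is a company"</code> with <code>start=0</code>,
+     * <code>end=5</code>, <code>contextStart=0</code> and
+     * <code>contextEnd=15</code> the digest is calculated over the string
+     * <code>"(Apple) is a comp"</code>.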
+     * @param text the text
+     * @param contextStart the start index of the prefix
+     * @param start the start index of the selected text part
+     * @param end the end index of the selected text part
+     * @param contextEnd the end index of the suffix
+     * @return the string representation of the MD5 digest over 
+     *  <code>'{prefix}({selected}){suffix}'</code> (NOTE the brackets that are
+     *  added at the start/end of the selected text)
+     */
+    private static String getContextDigest(String text, int contextStart, int start, int end, int contextEnd) {
+        ByteArrayOutputStream contextOs = new ByteArrayOutputStream();
+        Writer contextWriter = new OutputStreamWriter(contextOs, UTF8);
+        try {
+            if(contextStart<start){
+                contextWriter.append(text, contextStart, start);
+            }
+            contextWriter.append('(');
+            if(start < end){
+                contextWriter.append(text, start, end);
+            }
+            contextWriter.append(')');
+            if(end < contextEnd){
+                contextWriter.append(text,end,contextEnd);
+            }
+            contextWriter.flush();
+            return ContentItemHelper.streamDigest(
+                new ByteArrayInputStream(contextOs.toByteArray()),
+                null, "MD5");
+        } catch (IOException e) {
+            //NO IOExceptions in in-memory stream implementations
+            throw new IllegalStateException(e);
+        } finally {
+            IOUtils.closeQuietly(contextOs);
+        }
+    }
+
+    /**
+     * Writes basic information of the passed span by using NIF 2.0, including the
+     * {@link Nif20#Sentence}/{@link Nif20#Phrase}/{@link Nif20#Word} type based on 
+     * the {@link Span#getType()}.<p>
+     * As {@link AnalysedText} is based on the plain text version of the ContentItem
+     * this uses {@link Nif20#RFC5147String} based fragment URIs.<p>
+     * <i>NOTE:</i> This DOES NOT write string relations, lemma, POS ... information
+     * that might be stored as {@link Annotation} with the passed {@link Span}.
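+     * <p>
+     * For example (illustrative values), a {@link SpanTypeEnum#Token} span
+     * covering the characters <code>[10,15)</code> is written as the resource
+     * <code>{base}#char=10,15</code> with the types nif:RFC5147String and
+     * nif:Word, the nif:beginIndex/nif:endIndex values and (for spans shorter
+     * than 100 chars) the nif:anchorOf literal.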
+     * @param graph the graph to add the triples
+     * @param base the base URI
+     * @param text the {@link AnalysedText}
+     * @param language the {@link Language} or <code>null</code> if not known
+     * @param span the {@link Span} to write.
+     * @return the {@link UriRef} representing the parsed {@link Span} in the
+     * graph
+     */
+    public static UriRef writeSpan(MGraph graph, UriRef base, AnalysedText text, Language language, Span span){
+        UriRef segment = getNifRFC5147URI(base, span.getStart(), 
+            span.getType() == SpanTypeEnum.Text ? -1 : span.getEnd());
+        graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.RFC5147String.getUri()));
+        if(span.getEnd() - span.getStart() < 100){
+            graph.add(new TripleImpl(segment, Nif20.anchorOf.getUri(), 
+                new PlainLiteralImpl(span.getSpan(),language)));
+        } else {
+            graph.add(new TripleImpl(segment, Nif20.head.getUri(), 
+                new PlainLiteralImpl(span.getSpan().substring(0,10),language)));
+        }
+        graph.add(new TripleImpl(segment, Nif20.beginIndex.getUri(), 
+            lf.createTypedLiteral(span.getStart())));
+        graph.add(new TripleImpl(segment, Nif20.endIndex.getUri(), 
+            lf.createTypedLiteral(span.getEnd())));
+        switch (span.getType()) {
+            case Token:
+                graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Word.getUri()));
+                break;
+            case Chunk:
+                graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Phrase.getUri()));
+                break;
+            case Sentence:
+                graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Sentence.getUri()));
+                break;
+            case Text:
+                graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Context.getUri()));
+                break;
+            default:
+            	// no default:
+        }
+        return segment;
+    }
+    
+    /**
+     * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 2.0 to the passed
+     * RDF graph by using the passed segmentUri as subject.
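+     * <p>
+     * This writes nif:oliaCategory links for the {@link Pos} and
+     * {@link LexicalCategory} values of mapped {@link PosTag}s, the string tag
+     * as nif:posTag literal and the tag probability as
+     * {@link org.apache.stanbol.enhancer.servicesapi.rdf.Properties#ENHANCER_CONFIDENCE}.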
+     * @param graph the graph
+     * @param annotated the annotated element (e.g. a {@link Token})
+     * @param segmentUri the URI of the resource representing the parsed 
+     * annotated element in the graph
+     */
+    public static void writePos(MGraph graph, Annotated annotated, UriRef segmentUri) {
+        Value<PosTag> posTag = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+        if(posTag != null){
+            if(posTag.value().isMapped()){
+                for(Pos pos : posTag.value().getPos()){
+                    graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), 
+                        pos.getUri()));
+                }
+                for(LexicalCategory cat : posTag.value().getCategories()){
+                    graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), 
+                        cat.getUri()));
+                }
+            }
+            graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(), 
+                lf.createTypedLiteral(posTag.value().getTag())));
+            graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, 
+                lf.createTypedLiteral(posTag.probability())));
+        }
+    }    
+    
+    /**
+     * Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} as NIF 2.0 to the
+     * passed RDF graph by using the segmentUri as subject.
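+     * <p>
+     * The {@link LexicalCategory} of the {@link PhraseTag} is mapped to the
+     * according OLiA phrase type (see {@link #LEXICAL_TYPE_TO_PHRASE_TYPE}) and
+     * written as nif:oliaCategory together with the tag probability as
+     * confidence.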
+     * @param graph the graph
+     * @param annotated the annotated element (e.g. a {@link Chunk})
+     * @param segmentUri the URI of the resource representing the parsed 
+     * annotated element in the graph
+     */
+    public static void writePhrase(MGraph graph, Annotated annotated, UriRef segmentUri) {
+        Value<PhraseTag> phraseTag = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
+        if(phraseTag != null){
+            UriRef phraseTypeUri = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phraseTag.value().getCategory());
+            if(phraseTypeUri != null){ //add the oliaLink for the Phrase
+                graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), phraseTypeUri));
+                graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, 
+                    lf.createTypedLiteral(phraseTag.probability())));
+            }
+        }
+    }
+
+}

Added: stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java?rev=1633587&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java (added)
+++ stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java Wed Oct 22 10:09:41 2014
@@ -0,0 +1,197 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.nlp2rdf.engine;
+
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.nif.Nif20;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
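+/**
+ * {@link EnhancementEngine} that converts the {@link AnalysedText} content part
+ * of a {@link ContentItem} to NIF 2.0 RDF and adds it to the metadata of the
+ * ContentItem. It writes nif:Sentence, nif:Phrase and nif:Word resources for
+ * the according {@link SpanTypeEnum Span types} as well as POS, phrase and
+ * sentiment annotations.
+ * <p>
+ * The engine is registered with the name <code>nif20</code> and uses the
+ * post-processing ordering within an enhancement chain.
+ */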
+@Component(immediate = true, metatype = true, 
+    configurationFactory = true, //allow multiple configuration
+    policy = ConfigurationPolicy.OPTIONAL) //create a default instance
+@Service
+@Properties(value={
+        @Property(name= EnhancementEngine.PROPERTY_NAME,value="nif20")
+})
+public class Nif20MetadataEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties{
+
+    private final Logger log = LoggerFactory.getLogger(Nif20MetadataEngine.class);
+    //TODO: replace this with a real ontology
+    private final static UriRef SENTIMENT_PROPERTY = new UriRef(NamespaceEnum.fise+"sentiment-value");
+    private final LiteralFactory lf = LiteralFactory.getInstance();
+    
+    /**
+     * Activate and read the properties of this engine.
+     *
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException {
+        log.info("activating NIF 2.0 Metadata Engine");
+        super.activate(ce);
+        @SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ce.getProperties();
+        //TODO: read configuration
+    }
+    
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        return getAnalysedText(this, ci, false) != null ? 
+                ENHANCE_ASYNC : CANNOT_ENHANCE;
+    }
+
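+    /**
+     * Converts the {@link AnalysedText} of the passed ContentItem to NIF 2.0:
+     * the whole text is written as nif:Context (linked to the item via
+     * nif:sourceUrl) and the enclosed sentences, chunks and tokens are written
+     * as nif:Sentence, nif:Phrase and nif:Word resources referencing that
+     * context. The ContentItem is write locked while the triples are added to
+     * its metadata.
+     */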
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        AnalysedText at = getAnalysedText(this, ci, true);
+        String lang = EnhancementEngineHelper.getLanguage(ci);
+        Language language = lang == null ? null : new Language(lang);
+        //now iterate over the AnalysedText data and create the RDF representation
+        //TODO: make configurable
+        boolean sentences = true;
+        boolean phrases = true;
+        boolean words = true;
+        
+        EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
+        if(sentences){
+            activeTypes.add(SpanTypeEnum.Sentence);
+        }
+        if(phrases){
+            activeTypes.add(SpanTypeEnum.Chunk);
+        }
+        if(words){
+            activeTypes.add(SpanTypeEnum.Token);
+        }
+        MGraph metadata = ci.getMetadata();
+        UriRef base = ci.getUri();
+        ci.getLock().writeLock().lock();
+        try {
+            //write the context
+            UriRef text = Nif20Helper.writeSpan(metadata, base, at, language, at);
+            metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
+
+            Iterator<Span> spans = at.getEnclosed(activeTypes);
+            UriRef sentence = null;
+            UriRef phrase = null;
+            UriRef word = null;
+            boolean firstWordInSentence = true;
+            while(spans.hasNext()){
+                Span span = spans.next();
+                //TODO: filter Spans based on additional requirements
+                //(1) write generic information about the span
+                UriRef current = Nif20Helper.writeSpan(metadata, base, at, language, span);
+                //write the context
+                metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
+                //(2) add the relations between the different spans
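+                //    - sentences are chained via nif:nextSentence
+                //    - chunks are linked to their sentence via nif:superString
+                //    - words are linked with their sentence (nif:sentence,
+                //      nif:firstWord/nif:lastWord) and their phrase (nif:subString)
+                //      and are chained via nif:nextWord/nif:previousWord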
+                switch (span.getType()) {
+                    case Sentence:
+                        if(sentence != null){
+                            metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
+                        }
+                        sentence = current;
+                        firstWordInSentence = true;
+                        break;
+                    case Chunk:
+                        if(sentence != null){
+                            metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
+                            if(word != null){
+                                metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
+                            }
+                        }
+                        phrase = current;
+                        break;
+                    case Token:
+                        if(sentence != null){
+                            metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
+                            if(firstWordInSentence){
+                                metadata.add(new TripleImpl(sentence, Nif20.firstWord.getUri(), current));
+                                firstWordInSentence = false;
+                            }
+                        }
+                        if(phrase != null){
+                            metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
+                        }
+                        if(word != null){
+                            metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
+                            metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
+                        }
+                        word = current;
+                        break;
+                    default:
+                        break;
+                }
+                //(3) add specific information such as POS, chunk type ...
+                Nif20Helper.writePos(metadata, span, current);
+                Nif20Helper.writePhrase(metadata, span, current);
+
+                //OLiA does not include Sentiments
+                
+                Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
+                if(sentiment != null && sentiment.value() != null){
+                    metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, 
+                        lf.createTypedLiteral(sentiment.value())));
+                }
+            }
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+    }
+
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        return Collections.singletonMap(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, 
+            (Object)ServiceProperties.ORDERING_POST_PROCESSING);
+    }
+
+
+
+
+
+}

Added: stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java?rev=1633587&view=auto
==============================================================================
--- stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java (added)
+++ stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java Wed Oct 22 10:09:41 2014
@@ -0,0 +1,545 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.nlp.nif;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+
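+/**
+ * Enumeration of the classes and properties defined by the
+ * <a href="http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#">NIF 2.0 core ontology</a>.
+ * The {@link UriRef} of a member can be obtained via {@link #getUri()}, e.g.
+ * <code>Nif20.Word.getUri()</code> returns
+ * <code>http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Word</code>.
+ */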
+public enum Nif20 {
+	/**
+	 * A URI Scheme for NIF which is able to refer to a single, consecutive 
+	 * string in a context. Note that any scheme subclassing this class 
+	 * requires the existence of beginIndex, endIndex and referenceContext.
+	 * <p>
+	 * This is an abstract class and should not be serialized. 
+	 */
+	CString,
+	/**
+	 * An arbitrary URI (e.g. a URN) for an arbitrary string of the context. 
+	 * This is roughly the same as TextAnnotations are currently implemented in Stanbol.
+	 */
+	CStringInst,
+	/**
+	 * The string that serves as a context for its substrings. The Unicode String 
+	 * given in the nif:isString property must be used to calculate the begin and 
+	 * endIndex for all nif:Strings that have a nif:referenceContext property to 
+	 * this URI. For further information, see 
+	 * http://svn.aksw.org/papers/2013/ISWC_NIF/public.pdf 
+	 */
+	Context,
+	/**
+	 * A collection of contexts used to create an unordered set of context via 
+	 * the nif:hasContext property. This can be compared to a document collection, 
+	 * but here it is a collection of nif:Context and therefore a collection of 
+	 * annotated strings, not documents. 
+	 */
+	ContextCollection,
+	/**
+	 * see <a href="http://jens-lehmann.org/files/2012/ekaw_nif.pdf">
+	 * Linked-Data Aware URI Schemes for Referencing Text Fragments</a> 
+	 * by Sebastian Hellmann, Jens Lehmann und Sören Auer in EKAW 2012 for more
+	 * information.
+	 */
+	ContextHashBasedString,
+	/**
+	 * see <a href="http://jens-lehmann.org/files/2012/ekaw_nif.pdf">
+	 * Linked-Data Aware URI Schemes for Referencing Text Fragments</a> 
+	 * by Sebastian Hellmann, Jens Lehmann und Sören Auer in EKAW 2012 for more
+	 * information.
+	 */
+	OffsetBasedString,
+	/**
+	 * A Paragraph
+	 */
+	Paragraph,
+	/**
+	 * A nif:Phrase can be a nif:String, that is a chunk of several words or a 
+	 * word itself (e.g. a NounPhrase as a Named Entity). The term is underspecified 
+	 * and can be compatible with many definitions of phrase. Please subClass 
+	 * it to specify the meaning (e.g. for Chunking or Phrase Structure Grammar). 
+	 * Example: ((My dog)(also)(likes)(eating (sausage))) 
+	 */
+	Phrase,
+	/**
+	 * URIs of this class have to conform with the syntax of <a 
+	 * href="http://tools.ietf.org/html/rfc5147">RFC 5147</a> in a way that they 
+	 * end on a valid identifier, if you remove the prefix. Note that unlike 
+	 * RFC 5147, NIF does not require '#' URIs. So valid URIs are 
+	 * http://example.org#char=0,28 , http://example.org/whatever/char=0,28 , 
+	 * http://example.org/nif?char=0,28
+	 */
+	RFC5147String,
+	/**
+	 * A Sentence 
+	 */
+	Sentence,
+	/**
+	 * Individuals of this class are a string, i.e. Unicode characters, who 
+	 * have been given a URI and are used in the subject of an RDF statement.
+	 * <p>
+	 * This class is abstract and should not be serialized.
+	 * <p>
+	 * NIF-Stanbol (nif-stanbol.ttl): subclassOf nifs:Annotation because it 
+	 * "annotates" strings for example with begin and end index. The class is 
+	 * similar to fise:TextAnnotation
+	 */
+	String,
+	/**
+	 * A structure is a more or less arbitrary label for a partitioning of a 
+	 * string. We do not follow a strict approach for what a word, phrase, 
+	 * sentence, title, paragraph is. These labels enable the definition 
+	 * processes for tool chains, e.g. tool analyses nif:Paragraph and 
+	 * calculates term frequency.
+	 * <p>
+	 * This is an abstract class and should not be serialized. 
+	 */
+	Structure,
+	/**
+	 * A title within a text.
+	 */
+	Title,
+	/**
+	 * A URI Scheme for NIF, subclasses need to define guidelines on the URI 
+	 * Scheme as well as the text it refers to. This class is just to keep some 
+	 * order, and should not be serialized.
+	 * <p>
+	 * This is an abstract class and should not be serialized. 
+	 */
+	URIScheme,
+	/**
+	 *  The Word class represents strings that are tokens or words. A string is 
+	 *  a Word, if it is a word. We don't nitpick about whether it is a pronoun, 
+	 *  a name, a punctuation mark or an apostrophe or whether it is separated 
+	 *  by white space from another Word or something else. The string 
+	 *  'He enters the room.' for example has 5 words. Words are assigned by a 
+	 *  tokenizer NIF Implementation. Single word phrases might be tagged as 
+	 *  nif:Word and nif:Phrase.
+	 *  
+	 *  Example 1: "The White House" are three Words separated by whitespace
+	 *  
+	 *  Comment 1: We adopted the definition style from foaf:Person, see 
+	 *  here: http://xmlns.com/foaf/spec/#term_Person We are well aware that 
+	 *  the world out there is much more complicated, but we are ignorant about 
+	 *  it, for the following reasons:
+	 *  
+	 *  Comment 2: <ol>
+	 *  <li> NIF has a client-server and the client has the ability to 
+	 *  dictate the tokenization to the server (i.e. the NIF Implementation) by 
+	 *  sending properly tokenized NIF annotated with nif:Word. All NIF 
+	 *  Implementations are supposed to honor and respect the current assignment 
+	 *  of the Word class. Thus the client should decide which NIF Implementation 
+	 *  should create the tokenization. Therefore this class is not descriptive, 
+	 *  but prescriptive.
+	 *  <li>The client may choose to send an existing tokenization to a NIF 
+	 *  Implementation, with the capability to change (for better or for worse) 
+	 *  the tokenization.
+	 *  </ol>
+	 *  
+	 *  The class has not been named 'Token' as the NLP definition of 'token' 
+	 *  is descriptive (and not well-defined), while the assignment of what is 
+	 *  a Word and what not is prescriptive, e.g. "can't" could be described as 
+	 *  one, two or three tokens or defined as being one, two or three words. 
+	 *  For further reading, we refer the reader to: By all these lovely tokens... 
+	 *  Merging conflicting tokenizations by Christian Chiarcos, Julia Ritz, and 
+	 *  Manfred Stede. Language Resources and Evaluation 46(1):53-74 (2012) or 
+	 *  the short form: http://www.aclweb.org/anthology/W09-3005
+	 *  
+	 *  There the task at hand is to merge two tokenizations T_1 and T_2, which 
+	 *  is normally not the case in the NIF world as tokenization is prescribed, 
+	 *  i.e. given as a baseline (Note that this ideal state might not be 
+	 *  achieved by all implementations.)
+	 */
+	Word,
+	//Object Properties
+	/**
+	 * see <a href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+	 * Towards Web-Scale Collaborative Knowledge Extraction</a> page 21
+	 */
+	annotation,
+	/**
+	 * This property should be used to express that one Context is contained in 
+	 * another Context, e.g. several sentences of a document are modelled 
+	 * indivudally and refer to the broader context of the whole document.
+	 */
+	broaderContext,
+	/**
+	 * A dependency relation pointing from gov to dep.
+	 */
+	dependency,
+	/**
+	 * Links a nif:ContextCollection to its contexts. 
+	 */
+	hasContext,
+	/**
+	 * This property links sentences to their first word.
+	 */
+	firstWord,
+	/**
+	 * This property links sentences to their last word.
+	 */
+	lastWord,
+	/**
+	 * This property links sentences to their words.
+	 */
+	word,
+	/**
+	 * This object property models a relation between two nif:Strings. 
+	 * The name "inter" is kept generic and can be used to express any kind of 
+	 * relation in between (inter) two nif:Strings. Extensions can create 
+	 * rdfs:subPropertyOf for "head", "dependent", nif:substring and 
+	 * nif:nextWord. 
+	 */
+	inter,
+	/**
+	 * Defines the language of a substring of the context. 
+	 * If the language for the nif:Context should be specified, 
+	 * nif:predLang must be used. 
+	 */
+	lang,
+	/**
+	 * The inverse of nif:broaderContext
+	 */
+	narrowerContext,
+	/**
+	 * This property can be used to make resources of 
+	 * nif:Sentence traversable, it can not be assumed that no gaps 
+	 * or whitespaces between sentences or words exist, i.e. string adjacency 
+	 * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl 
+	 * and need to be included separately to keep a low reasoning profile. 
+	 * They are modeled after skos:broader and skos:broaderTransitive
+	 */
+	nextSentence,
+	/**
+	 * transitive version of {@link #nextSentence}
+	 */
+	nextSentenceTrans,
+	/**
+	 * This property can be used to make resources of 
+	 * nif:Word traversable, it can not be assumed that no gaps 
+	 * or whitespaces between sentences or words exist, i.e. string adjacency 
+	 * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl 
+	 * and need to be included separately to keep a low reasoning profile. 
+	 * They are modeled after skos:broader and skos:broaderTransitive
+	 */
+	nextWord,
+	/**
+	 * transitive version of {@link #nextWord}
+	 */
+	nextWordTrans,
+	/**
+	 * This property links a string to a URI from one of the OLiA Annotation models, 
+	 * e.g. members of the {@link Pos} enumeration.
+	 */
+	oliaLink,
+	/**
+	 * This property is used to link to a <a href="http://marl.gi2mo.org/?page_id=1#overview">marl:Opinion</a>. 
+	 * We have not investigated marl, so it might be replaced.
+	 * <p>
+	 * InverseOf marl:extractedFrom
+	 */
+	opinion,
+	/**
+	 * Defines the predominant language of the text. If this annotation is given 
+	 * on a nif:Context, all NIF tools have to treat the text to be in this 
+	 * language unless specified differently for a subpart. To change the 
+	 * language for a smaller substring nif:lang must be used.
+	 * <p>
+	 * This property requires a uri as an argument. We expect this to be a URI 
+	 * from the lexvo.org namespace, e.g. http://lexvo.org/id/iso639-3/eng using 
+	 * ISO639-3
+	 * <p>
+	 * Examples:
+	 * <p>
+	 * "The dealer says: "Rien ne va plus!" "
+	 * <p>
+	 * has nif:predLang http://lexvo.org/id/iso639-3/eng and 
+	 * nif:lang http://www.lexvo.org/page/iso639-3/fra
+	 * <p>
+	 * see also: http://www.w3.org/TR/its20/#selection-local
+	 * <p>
+	 * Tests for RDFUnit (not written yet):
+	 * <p>
+	 * - write a test for RDFUnit, so people do not use 
+	 * http://www.lexvo.org/page/iso639-3/eng 
+	 */
+	predLang,
+	/**
+	 * This property can be used to make resources of 
+	 * nif:Sentence traversable, it can not be assumed that no gaps 
+	 * or whitespaces between sentences or words exist, i.e. string adjacency 
+	 * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl 
+	 * and need to be included separately to keep a low reasoning profile. 
+	 * They are modeled after skos:broader and skos:broaderTransitive
+	 */
+	previousSentence,
+	/**
+	 * Transitive version of {@link #previousSentence}
+	 */
+	previousSentenceTrans,
+	/**
+	 * This property can be used to make resources of 
+	 * nif:Word traversable, it can not be assumed that no gaps 
+	 * or whitespaces between sentences or words exist, i.e. string adjacency 
+	 * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl 
+	 * and need to be included separately to keep a low reasoning profile. 
+	 * They are modeled after skos:broader and skos:broaderTransitive
+	 */
+	previousWord,
+	/**
+	 * Transitive version of {@link #previousWord}
+	 */
+	previousWordTrans,
+	/**
+	 * Links to the URI describing the provenance
+	 */
+	oliaProv,
+	/**
+	 * Links a URI of a string to its reference context of type nif:Context. 
+	 * The reference context determines the calculation of begin and end index
+	 * <p>
+	 * Each String that is not an instance of nif:Context MUST have exactly one 
+	 * reference context.
+	 * <p>
+	 * Inferences (nif-core-inf.ttl):
+	 * <p>
+	 * Instances of nif:Context do have itself as reference context, this is 
+	 * inferred automatically, MAY be materialized, as well.
+	 * <p>
+	 * OWL validation (nif-core-val.ttl):
+	 * <p>
+	 * This property is functional.
+	 */
+	referenceContext,
+	/**
+	 * This property links words to their sentence.
+	 */
+	sentence,
+	/**
+	 * The URL the context was extracted from, e.g. the blog or news article url. 
+	 * Doesn't matter whether it is HTML or XML or plain text. rdfs:range is 
+	 * foaf:Document. Subproperty of prov:hadPrimarySource. In case the string 
+	 * comes from another NIF String and gives the exact provenance, please use 
+	 * nif:wasConvertedFrom or a subProperty thereof.
+	 */
+	sourceUrl,
+	/**
+	 * This property together with nif:subString, nif:superString, and their 
+	 * transitive extension can be used to express that one string is contained 
+	 * in another one. Examples: "a" nif:subString "apple" , "apple" 
+	 * nif:subString "apple". The transitivity axioms are included in 
+	 * nif-core-inf.ttl and need to be included separately to keep a low 
+	 * reasoning profile. They are modeled after skos:broader and 
+	 * skos:broaderTransitive
+	 */
+	subString,
+	/**
+	 * Inverse of {@link #subString}
+	 */
+	superString,
+	/**
+	 * Transitive version of {@link #dependency}
+	 */
+	dependencyTrans,
+	/**
+	 * Transitive version of {@link #subString}
+	 */
+	subStringTrans,
+	/**
+	 * Transitive version of {@link #superString}
+	 */
+	superStringTrans,
+	/**
+	 * This property should be used, when mapping one nif:String or nif:Context 
+	 * to another and is often confused with nif:sourceUrl.
+	 * <p>
+	 * While nif:sourceUrl is built on PROV-O and is used to link the nif:Context 
+	 * to the document URL for provenance information, nif:wasConvertedFrom is more 
+	 * precise and pinpoints exact locations where a certain NIF String 
+	 * "wasConvertedFrom".
+	 * <p>
+	 * nif:wasConvertedFrom is therefore used to provide *exact* provenance 
+	 * during a conversion process, e.g. when removing tags from XHTML and then 
+	 * linking XPath URIs to NIF index based URIs (e.g. RFC 5147 with char=x,y). 
+	 * An example of the usage of this property can be found here: 
+	 * http://www.w3.org/TR/its20/#conversion-to-nif
+	 * <p>
+	 * Example
+	 * <p>
+	 * # "Dublin"
+	 * <p>
+	 * &lt;http://example.com/myitsservice?informat=html&intype=url&input=http://example.com/doc.html&char=11,17&gt;
+	 * <p>
+	 * nif:wasConvertedFrom
+	 * <p>
+	 * &lt;http://example.com/myitsservice?informat=html&intype=url&input=http://example.com/doc.html&xpath=/html/body[1]/h2[1]/span[1]/text()[1]&gt;.
+	 */
+	wasConvertedFrom,
+	//Datatype properties
+	/**
+	 * For each string you can include a snippet (e.g. 10-40 characters of text), 
+	 * that occurs immediately after the subject string.
+	 */
+	after,
+	/**
+	 * The string, which the URI is representing as an RDF Literal. Some use 
+	 * cases require this property, as it is necessary for certain sparql queries. 
+	 */
+	anchorOf,
+	/**
+	 * For each string you can include a snippet (e.g. 10-40 characters of text), 
+	 * that occurs immediately before the subject string.
+	 */
+	before,
+	/**
+	 * The begin index of a character range as defined in 
+	 * http://tools.ietf.org/html/rfc5147#section-2.2.1 and 
+	 * http://tools.ietf.org/html/rfc5147#section-2.2.2, measured as the gap 
+	 * between two characters, starting to count from 0 (the position before 
+	 * the first character of a text).
+	 * <p>
+	 * Example: Index "2" is the position between "Mr" and "." in "Mr. Sandman".
+	 * <p>
+	 * Note: RFC 5147 is re-used for the definition of character ranges. RFC 5147 
+	 * is assuming a text/plain MIME type. NIF builds upon Unicode and is content 
+	 * agnostic.
+	 * <p>
+	 * Requirement (1): This property has the same value as the "Character position" 
+	 * of RFC 5147 and it MUST therefore be castable to xsd:nonNegativeInteger, 
+	 * i.e. it MUST not have negative values.
+	 * <p>
+	 * Requirement (2): The index of the subject string MUST be calculated 
+	 * relative to the nif:referenceContext of the subject. If available, this 
+	 * is the rdf:Literal of the nif:isString property.
+	 */
+	beginIndex,
+	/**
+	 * The confidence is relative to the tool and can be between 0.0 and 1.0, 
+	 * it is for nif:oliaLink and therefore also for nif:oliaCategory.
+	 */
+	oliaConf,
+	/**
+	 * The end index of a character range as defined in 
+	 * http://tools.ietf.org/html/rfc5147#section-2.2.1 and 
+	 * http://tools.ietf.org/html/rfc5147#section-2.2.2, measured as the gap 
+	 * between two characters, starting to count from 0 (the position before 
+	 * the first character of a text).
+	 * <p>
+	 * Example: Index "2" is the position between "Mr" and "." in "Mr. Sandman".
+	 * <p>
+	 * Note: RFC 5147 is re-used for the definition of character ranges. RFC 5147 
+	 * is assuming a text/plain MIME type. NIF builds upon Unicode and is content 
+	 * agnostic.
+	 * <p>
+	 * Requirement (1): This property has the same value as the "Character position" 
+	 * of RFC 5147 and it must therefore be an xsd:nonNegativeInteger.
+	 * <p>
+	 * Requirement (2): The index of the subject string MUST be calculated 
+	 * relative to the nif:referenceContext of the subject. If available, this 
+	 * is the rdf:Literal of the nif:isString property.
+	 */
+	endIndex,
+	/**
+	 * The first few chars of the nif:anchorOf. Typically used if the nif:anchorOf
+	 * is too long for inclusion as RDF literal.
+	 */
+	head,
+	/**
+	 * The reference text as rdf:Literal for this nif:Context resource.
+	 * NIF requires that the reference text (i.e. the context) is always 
+	 * included in the RDF as an rdf:Literal.
+	 * <p>
+	 * Note, that the isString property is *the* place to keep the string itself 
+	 * in RDF.
+	 * <p>
+	 * All other nif:Strings and nif:URISchemes relate to the text of this 
+	 * property to calculate character position and indices.
+	 */
+	isString,
+	/**
+	 * The lemma(s) of the nif:String.
+	 */
+	lemma,
+	/**
+	 * see <a href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+	 * Towards Web-Scale Collaborative Knowledge Extraction</a> page 21.
+	 */
+	literalAnnotation,
+	/**
+	 * To include the pos tag as it comes out of the NLP tool as RDF Literal. 
+	 * This property is discouraged to use alone, please use oliaLink and 
+	 * oliaCategory. We included it, because some people might still want it 
+	 * and will even create their own property, if the string variant is missing  
+	 */
+	posTag,
+	/**
+	 * Between -1 negative and 1 positive
+	 */
+	 * Between -1 (negative) and 1 (positive)
+	/**
+	 * The stem(s) of the nif:String.
+	 */
+	stem,
+	//Annotation properties
+	/**
+	 * A simple annotation for machine learning purposes. The object can be 
+	 * anything, e.g. the literal "A. PRESS: Reportage" from Brown or any URI. 
+	 */
+	category,
+	/**
+	 * see <a href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+	 * Towards Web-Scale Collaborative Knowledge Extraction</a> page 12.
+	 */
+	classAnnotation,
+	/**
+	 * This property marks the most specific class from itsrdf:taClassRef. 
+	 * The rule is: from the set S of itsrdf:taClassRef attached to this resource 
+	 * taMsClassRef points to the one that does not have any subclasses in the set 
+	 * S except itself. So if taClassRef is owl:Thing, dbo:Agent, dbo:Person and 
+	 * dbo:Actor, then taMsClassRef is dbo:Actor.
+	 */
+	taMsClassRef,
+	/**
+	 * This property links a string URI to classes of the OLiA Reference model. 
+	 * It provides a direct link for querying, thus it is a redundant optimization.
+	 * <p>
+	 * Values are expected to be members of {@link Pos} or {@link LexicalCategory}.
+	 */
+	oliaCategory,
+	;
+    public final static String NAMESPACE = "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#";
+
+    UriRef uri;
+    
+    private Nif20() {
+        uri = new UriRef(NAMESPACE+name());
+    }
+    
+    public String getLocalName(){
+        return name();
+    }
+    
+    public UriRef getUri(){
+        return uri;
+    }
+    
+    @Override
+    public String toString() {
+        return uri.getUnicodeString();
+    }
+
+}